File size: 10,541 Bytes
4997aeb
 
da863bf
3ae066d
 
0bff6fd
940c185
517f1a0
f06193e
bb5b4ac
26a8844
 
9c2d532
4997aeb
0fb4cf5
4997aeb
8eb3e51
4997aeb
2da6f20
4997aeb
 
2da6f20
4997aeb
2da6f20
 
4997aeb
2da6f20
 
 
4997aeb
 
 
 
 
 
e496258
d9af0b6
4997aeb
 
8ce3d9b
4997aeb
 
9eb3e78
11bc07e
9eb3e78
dcca063
 
9eb3e78
 
 
 
 
79497a3
c9dd21c
9eb3e78
2376b2f
64523d8
 
 
 
 
9eb3e78
 
 
1b75632
9eb3e78
 
c9dd21c
9eb3e78
 
 
 
 
1b75632
9eb3e78
 
f76455a
64523d8
9eb3e78
 
26a8844
 
 
 
a1f90e6
 
 
 
 
 
 
0a8201e
a1f90e6
 
 
26a8844
a1f90e6
 
abef2ac
a1f90e6
 
8c993f6
065cb17
a1f90e6
c15629f
a1f90e6
26a8844
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455de4c
26a8844
 
 
 
4baf582
8ce3d9b
9eb3e78
 
5d2299c
9eb3e78
8ce3d9b
 
 
 
26a8844
 
dcb00f7
3d67d69
dcb00f7
a1f90e6
a969331
 
 
a1f90e6
a969331
c2e5bed
a969331
 
78c999a
a969331
 
 
 
909aec0
64523d8
e488916
160526f
 
 
 
26a8844
544f3f0
150b8d9
909aec0
 
 
26a8844
909aec0
 
6725ef7
8ce3d9b
 
 
 
23247c4
8ce3d9b
 
 
 
9eb3e78
8a3a5d7
940c185
 
 
 
f06193e
 
 
 
 
80682f3
f06193e
80682f3
f06193e
517f1a0
f06193e
7c92df9
 
 
 
 
 
 
 
ec7fc78
 
517f1a0
609ef88
 
 
 
 
 
 
517f1a0
 
 
 
 
 
a78496d
 
 
db780e3
a78496d
 
517f1a0
a78496d
517f1a0
 
 
a78496d
517f1a0
a78496d
517f1a0
 
 
 
 
 
 
 
 
ec7fc78
 
 
 
 
d34a703
792de2f
c15629f
7a0f54c
 
 
c15629f
 
a1f90e6
81d7170
792de2f
61c3232
ec7fc78
7d43644
 
 
 
 
39dae03
 
7d43644
 
 
 
 
 
940c185
 
609ef88
ec7fc78
 
 
 
517f1a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import streamlit as st
import os
from streamlit_chat import message
import numpy as np
import pandas as pd
from io import StringIO
import PyPDF2
from tqdm.auto import tqdm
import math
from transformers import pipeline
from langchain.prompts import ChatPromptTemplate
import re
# import json

# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")

# from datasets import load_dataset

# dataset = load_dataset("wikipedia", "20220301.en", split="train[240000:250000]")


# wikidata = []

# for record in dataset:
#     wikidata.append(record["text"])

# wikidata = list(set(wikidata))
# # print("\n".join(wikidata[:5]))
# # print(len(wikidata))

from sentence_transformers import SentenceTransformer
import torch

# Pick the GPU when one is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device != 'cuda':
    st.markdown(f"Note: Using {device}. Expected slow responses compare to CUDA-enabled GPU. Please be patient thanks")


@st.cache_resource
def _load_embedding_model(device_name):
    """Load the sentence-embedding model once and reuse it on later reruns.

    Streamlit re-executes the script on every interaction; without caching
    the model would be re-instantiated each time a message is sent or a
    file is uploaded.

    Args:
        device_name: Torch device string the model is placed on.

    Returns:
        The ``SentenceTransformer`` instance for "all-MiniLM-L6-v2".
    """
    return SentenceTransformer("all-MiniLM-L6-v2", device=device_name)


# Module-level embedding model used by the query and ingestion code below.
model = _load_embedding_model(device)
st.divider()

# Create an index (Pinecone vector database)
import os
# import pinecone

from pinecone.grpc import PineconeGRPC


# Pinecone settings are taken from the process environment; each value is
# None when the corresponding variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

# pc = PineconeGRPC( api_key=os.environ.get("PINECONE_API_KEY") ) # Now do stuff if 'my_index' not in pc.list_indexes().names(): pc.create_index( name='my_index', dimension=1536, metric='euclidean', spec=ServerlessSpec( cloud='aws', region='us-west-2' ) )

def connect_pinecone():
    """Build a Pinecone gRPC client from the module-level settings.

    Returns:
        The ``PineconeGRPC`` client constructed with ``PINECONE_API_KEY``
        as the API key and ``PINECONE_ENV`` as the environment.
    """
    client_settings = {
        "api_key": PINECONE_API_KEY,
        "environment": PINECONE_ENV,
    }
    return PineconeGRPC(**client_settings)

def get_pinecone_semantic_index(pinecone):
    """Return a handle to the semantic-search index, creating it if absent.

    Args:
        pinecone: Pinecone client exposing ``list_indexes()``,
            ``create_index()`` and ``Index()``.

    Returns:
        The object returned by ``pinecone.Index(index_name)``.
    """
    # The spelling "sematic" is kept on purpose: this is the name of the
    # index on the server, so correcting it would point at a different index.
    index_name = "sematic-search-index"

    # Only create the index if it does not exist yet.
    if index_name not in pinecone.list_indexes().names():
        # ServerlessSpec was used without ever being imported, so the first
        # run against an account without the index raised NameError.
        # Imported here so the fix does not depend on the file's import block.
        from pinecone import ServerlessSpec

        pinecone.create_index(
            name=index_name,
            # NOTE(review): `description` is not among the create_index
            # parameters I can find documented - confirm the installed
            # client accepts it, otherwise drop this argument.
            description="Semantic search",
            # The index dimension must match the embedding model's output size.
            dimension=model.get_sentence_embedding_dimension(),
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1'),
        )

    # Now connect to the (existing or freshly created) index.
    return pinecone.Index(index_name)



@st.cache_resource
def _load_summarizer():
    """Load the BART summarization pipeline once and reuse it on later calls."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


def promt_engineer(text, query):
    """Summarize the retrieved text and generate an answer to the query.

    The summary is written to the sidebar and the generated answer is
    written to the app; both are also returned.

    Args:
        text: Context text (the concatenated search results).
        query: The user's question.

    Returns:
        A ``(summary, result)`` tuple. ``result`` is the empty string when
        answer generation fails; the failure is reported with ``st.error``.
    """
    summary_prompt_template = """
    write a concise summary of the following text delimited by triple backquotes.
    return your response in bullet points which convers the key points of the text.

    ```{text}```

    BULLET POINT SUMMARY:
    """
    # The pipeline is expensive to build, so it is cached instead of being
    # re-created for every question.
    summarizer = _load_summarizer()

    # Generate the prompt
    prompt = summary_prompt_template.format(text=text)

    # Generate the summary
    summary = summarizer(prompt, max_length=1024, min_length=50)[0]["summary_text"]

    with st.sidebar:
        st.divider()
        st.markdown("*:red[Text Summary Generation]* from above Top 5 **:green[similarity search results]**.")
        st.write(summary)
        st.divider()

    GENERATION_PROMPT_TEMPLATE = """
    Instructions:
    -------------------------------------------------------------------------------------------------------------------------------
    Answer the question only based on the below context:
    - You're a Research AI expert in the explaining and reading the research papers.
    - Questions with out-of-context replay with The question is out of context. 
    - Always try to provide Keep it simple answers in nice format without incomplete sentence.
    - Give the answer atleast 5 seperate lines addition to the title info.
    - Only If question is relevent to context provide Doc Title: <title> Paragraph: <Paragraph> Page No: <pagenumber> 
    -------------------------------------------------------------------------------------------------------------------------------
    {context}
    -------------------------------------------------------------------------------------------------------------------------------
    Answer the question based on the above context: {question}
    """

    prompt_template = ChatPromptTemplate.from_template(GENERATION_PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=text, question=query)
    result = ""

    # This is the UI boundary: any failure (missing token, network error,
    # import problem) is shown to the user instead of crashing the callback.
    try:
        # HuggingFaceHub was never imported, so this block always failed
        # with a NameError that the handler below turned into an error box.
        from langchain.llms import HuggingFaceHub

        llm = HuggingFaceHub(
            repo_id="meta-llama/Meta-Llama-3-8B-Instruct", model_kwargs={"temperature": 0.1, "max_new_tokens": 256, "task":"text-generation"}
        )
        response_text = llm.invoke(prompt)
        # Keep only what follows the final instruction line, in case the
        # response echoes the prompt; the query is escaped because it is
        # user input embedded in a regex.
        escaped_query = re.escape(query)
        result = re.split(f'Answer the question based on the above context: {escaped_query}\n',response_text)[-1]
        st.write(result)
    except Exception as e:
        st.error(f"Error invoke: {e}")

    return summary, result

def chat_actions():
    """Handle a submitted chat message.

    Records the user's message in the chat history, embeds it, retrieves
    the five nearest chunks from the Pinecone index, shows them in the
    sidebar, and asks ``promt_engineer`` for a summary and an answer. The
    answer is appended to the chat history only when the search returned
    at least one match.
    """
    pinecone = connect_pinecone()
    index = get_pinecone_semantic_index(pinecone)

    query = st.session_state["chat_input"]
    st.session_state["chat_history"].append(
        {"role": "user", "content": query},
    )

    # Embed the question and search the vector database with it. The vector
    # is passed by keyword so the call does not rely on argument order.
    query_vector = model.encode(query).tolist()
    result = index.query(vector=query_vector, top_k=5, include_metadata=True)
    matches = result['matches']

    # One table row per match, ranked from 1, plus the joined context text.
    texts = [match['metadata']['text'] for match in matches]
    data = [
        [f"{rank}⭐", match['score'], text]
        for rank, (match, text) in enumerate(zip(matches, texts), start=1)
    ]
    consolidated_text = "".join(texts)

    # Create a DataFrame from the list of lists
    resdf = pd.DataFrame(data, columns=['TopRank', 'Score', 'Text'])

    with st.sidebar:
        st.markdown("*:red[semantic search results]* with **:green[Retrieval Augmented Generation]** ***(RAG)***.")
        st.dataframe(resdf)
        bytesize = consolidated_text.encode("utf-8")
        mbsize = round(len(bytesize) / math.pow(1024, 2), 2)
        st.write(f"Text lenth of {len(consolidated_text)} characters with {mbsize}MB size")
        # Only the first 1024 characters of the context are sent on.
        _summary, response = promt_engineer(consolidated_text[:1024], query)

    # Reply once, and only when the search produced at least one match
    # (this is what the original loop-with-break did).
    if matches:
        st.session_state["chat_history"].append(
            {
                "role": "assistant",
                "content": f"{response}",
            },
        )

# Start with an empty transcript the first time the session is seen.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

# Chat box; submitting a message runs chat_actions, which reads the text
# from st.session_state["chat_input"].
st.chat_input(
    "show me the contents of ML paper published on xxx with article no. xx?",
    on_submit=chat_actions,
    key="chat_input",
)

# Replay the stored conversation, one chat bubble per entry.
for entry in st.session_state["chat_history"]:
    with st.chat_message(name=entry["role"]):
        st.write(entry["content"])

def print_out(pages):
    """Write the stripped text of each page to the app, labelled by index.

    Args:
        pages: Sequence of page objects providing ``extract_text()``.
    """
    for page_index, page in enumerate(pages):
        page_text = page.extract_text().strip()
        st.write(f"Page {page_index} : {page_text}")

def combine_text(pages):
    """Concatenate the stripped text of all pages and report its size.

    Args:
        pages: Iterable of page objects providing ``extract_text()``.

    Returns:
        The page texts joined together with no separator.
    """
    page_texts = [page.extract_text().strip() for page in tqdm(pages)]
    concatenates_text = "".join(page_texts)

    # Size of the UTF-8 encoding, expressed in megabytes.
    encoded_length = len(concatenates_text.encode("utf-8"))
    mbsize = round(encoded_length / math.pow(1024, 2), 2)

    st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
    return concatenates_text

def split_into_chunks(text, chunk_size):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string (or other sliceable sequence) to split.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        A list of slices in order. Every slice has ``chunk_size`` items
        except possibly the last; empty input gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size used
            to return an empty list silently, discarding the whole text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def create_embeddings():
    """Embed the text of every uploaded PDF and upsert it into the index.

    Reads the files from ``st.session_state["uploaded_files"]``, echoes
    their pages in the sidebar, splits the combined text into fixed-size
    chunks, encodes the chunks with the module-level ``model`` and upserts
    them in small batches. Vector ids are the chunk positions ("0", "1",
    ...), restarting from "0" on every call.
    """
    # Collect the text of ALL uploaded files. Previously the variable was
    # reassigned per file, so only the last PDF was ever embedded.
    document_texts = []
    with st.sidebar:
        uploaded_files = st.session_state["uploaded_files"] or []
        for uploaded_file in uploaded_files:
            # Read the contents of the file
            reader = PyPDF2.PdfReader(uploaded_file)
            pages = reader.pages
            print_out(pages)
            document_texts.append(combine_text(pages))
    inputtext = "".join(document_texts)

    # Nothing to embed (no files, or no extractable text): skip the index
    # round-trip rather than reporting embeddings that were never created.
    if not inputtext:
        if uploaded_files:
            with st.sidebar:
                st.warning("No extractable text found in the uploaded PDF(s); nothing was embedded.")
        return

    # connect to pinecone index
    pinecone = connect_pinecone()
    index = get_pinecone_semantic_index(pinecone)

    # The maximum metadata size per vector is 40KB ~ 40000 bytes and each
    # character takes 1 to 2 bytes or more, hence a chunk size of 10000.
    # NOTE(review): the embedding model presumably truncates long inputs, so
    # a chunk this large may only be partly represented - confirm.
    chunk_size = 10000
    batch_size = 2
    chunks = split_into_chunks(inputtext, chunk_size)

    for start in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[start:start + batch_size]
        # ids are the global chunk positions of this batch
        ids = [str(position) for position in range(start, start + len(batch))]
        # the original text is stored next to each vector so it can be retrieved
        metadata = [{"text": chunk} for chunk in batch]
        # plain lists of floats, not numpy rows, are sent to the client
        embeddings = model.encode(batch).tolist()
        # a concrete list of (id, values, metadata) tuples, not a lazy zip
        records = list(zip(ids, embeddings, metadata))
        index.upsert(vectors=records)

    with st.sidebar:
        st.write("created vector embeddings!")
        # check no of records in the index
        st.write(f"{index.describe_index_stats()}")


    # Display the contents of the file
    # st.write(file_contents)

# Sidebar: usage instructions followed by the PDF uploader.
with st.sidebar:
    st.markdown("""
    ***:red[Follow this steps]***
    - upload pdf file to create embeddings using model on your own docs
    - wait see success message on embeddings creation 
    - It Takes couple of mins after upload the pdf
    - Now Chat with your documents with help of this RAG system 
    - It Generate Promted reponses on the upload pdf
    - Provides summarized results and QA's using GPT models
    - This system already trained on some wikipedia datasets too
    """)
    # Changing the uploader's value invokes create_embeddings, which reads
    # the selected files from st.session_state["uploaded_files"] (the key
    # given here). Only .pdf files are accepted; several may be selected.
    uploaded_files = st.file_uploader('Choose your .pdf file', type="pdf", accept_multiple_files=True, key="uploaded_files", on_change=create_embeddings)
    # for uploaded_file in uploaded_files:
        # To read file as bytes:
        # bytes_data = uploaded_file.getvalue()
        # st.write(bytes_data)

        # To convert to a string based IO:
        # stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
        # st.write(stringio)

        # To read file as string:
        # string_data = stringio.read()
        # st.write(string_data)

        # Can be used wherever a "file-like" object is accepted:
        # dataframe = pd.read_csv(uploaded_file)
        # st.write(dataframe)

        # reader = PyPDF2.PdfReader(uploaded_file)
        # pages = reader.pages
        # print_out(pages)
        # combine_text(pages)
        # promt_engineer(text)