chat-with-docs / app.py
KushwanthK's picture
Update app.py
def373d verified
raw
history blame
10.5 kB
import streamlit as st
import os
from streamlit_chat import message
import numpy as np
import pandas as pd
from io import StringIO
import PyPDF2
from tqdm.auto import tqdm
import math
from transformers import pipeline
from langchain.prompts import ChatPromptTemplate
import re
# import json
# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
# from datasets import load_dataset
# dataset = load_dataset("wikipedia", "20220301.en", split="train[240000:250000]")
# wikidata = []
# for record in dataset:
# wikidata.append(record["text"])
# wikidata = list(set(wikidata))
# # print("\n".join(wikidata[:5]))
# # print(len(wikidata))
from sentence_transformers import SentenceTransformer
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
st.markdown(f"Note: Using {device}. Expected slow responses compare to CUDA-enabled GPU. Please be patient thanks")
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
st.divider()
# Creating a Index(Pinecone Vector Database)
import os
# import pinecone
from pinecone.grpc import PineconeGRPC
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_ENV=os.getenv("PINECONE_ENV")
PINECONE_ENVIRONMENT=os.getenv("PINECONE_ENVIRONMENT")
# pc = PineconeGRPC( api_key=os.environ.get("PINECONE_API_KEY") ) # Now do stuff if 'my_index' not in pc.list_indexes().names(): pc.create_index( name='my_index', dimension=1536, metric='euclidean', spec=ServerlessSpec( cloud='aws', region='us-west-2' ) )
def connect_pinecone():
pinecone = PineconeGRPC(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
# st.code(pinecone)
# st.divider()
# st.text(pinecone.list_indexes().names())
# st.divider()
# st.text(f"Succesfully connected to the pinecone")
return pinecone
def get_pinecone_semantic_index(pinecone):
index_name = "sematic-search-index"
# only create if it deosnot exists
if index_name not in pinecone.list_indexes().names():
pinecone.create_index(
name=index_name,
description="Semantic search",
dimension=model.get_sentence_embedding_dimension(),
metric="cosine",
spec=ServerlessSpec( cloud='aws', region='us-east-1' )
)
# now connect to index
index = pinecone.Index(index_name)
# st.text(f"Succesfully connected to the pinecone index")
return index
def prompt_engineer(text, query):
summary_prompt_template = """
write a concise summary of the following text delimited by triple backquotes.
return your response in bullet points which convers the key points of the text.
```{text}```
BULLET POINT SUMMARY:
"""
# Load the summarization pipeline with the specified model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Generate the prompt
prompt = summary_prompt_template.format(text=text)
# Generate the summary
summary = summarizer(prompt, max_length=1024, min_length=50)[0]["summary_text"]
with st.sidebar:
st.divider()
st.markdown("*:red[Text Summary Generation]* from above Top 5 **:green[similarity search results]**.")
st.write(summary)
st.divider()
GENERATION_PROMPT_TEMPLATE = """
Instructions:
-------------------------------------------------------------------------------------------------------------------------------
Answer the question only based on the below context:
- You're a Research AI expert in the explaining and reading the research papers.
- Questions with out-of-context replay with The question is out of context.
- Always try to provide Keep it simple answers in nice format without incomplete sentence.
- Give the answer atleast 5 seperate lines addition to the title info.
- Only If question is relevent to context provide Doc Title: <title> Paragraph: <Paragraph> Page No: <pagenumber>
-------------------------------------------------------------------------------------------------------------------------------
{context}
-------------------------------------------------------------------------------------------------------------------------------
Answer the question based on the above context: {question}
"""
prompt_template = ChatPromptTemplate.from_template(GENERATION_PROMPT_TEMPLATE)
prompt = prompt_template.format(context=text, question=query)
response_text = ""
result = ""
try:
llm = HuggingFaceHub(
repo_id="meta-llama/Meta-Llama-3-8B-Instruct", model_kwargs={"temperature": 0.1, "max_new_tokens": 256, "task":"text-generation"}
)
response_text = llm.invoke(prompt)
escaped_query = re.escape(query)
result = re.split(f'Answer the question based on the above context: {escaped_query}\n',response_text)[-1]
st.write(result)
except Exception as e:
st.error(f"Error invoke: {e}")
return summary, result
def chat_actions():
pinecone = connect_pinecone()
index = get_pinecone_semantic_index(pinecone)
st.session_state["chat_history"].append(
{"role": "user", "content": st.session_state["chat_input"]},
)
query = st.session_state["chat_input"]
query_embedding = model.encode(query)
# create the query vector
query_vector = query_embedding.tolist()
# now query vector database
result = index.query(query_vector, top_k=5, include_metadata=True) # result is a list of tuples
# Create a list of lists
data = []
consolidated_text = ""
i = 0
for res in result['matches']:
i = i + 1
data.append([f"{i}⭐", res['score'], res['metadata']['text']])
consolidated_text += res['metadata']['text']
# Create a DataFrame from the list of lists
resdf = pd.DataFrame(data, columns=['TopRank', 'Score', 'Text'])
with st.sidebar:
st.markdown("*:red[semantic search results]* with **:green[Retrieval Augmented Generation]** ***(RAG)***.")
st.dataframe(resdf)
bytesize = consolidated_text.encode("utf-8")
p = math.pow(1024, 2)
mbsize = round(len(bytesize) / p, 2)
st.write(f"Text length of {len(consolidated_text)} characters with {mbsize}MB size")
summary, response = prompt_engineer(consolidated_text[:1024], query)
for res in result['matches']:
st.session_state["chat_history"].append(
{
"role": "assistant",
"content": f"{response}",
}, # This can be replaced with your chat response logic
)
break;
if "chat_history" not in st.session_state:
st.session_state["chat_history"] = []
st.chat_input("show me the contents of ML paper published on xxx with article no. xx?", on_submit=chat_actions, key="chat_input")
for i in st.session_state["chat_history"]:
with st.chat_message(name=i["role"]):
st.write(i["content"])
def print_out(pages):
for i in range(len(pages)):
text = pages[i].extract_text().strip()
st.write(f"Page {i} : {text}")
def combine_text(pages):
concatenates_text = ""
for page in tqdm(pages):
text = page.extract_text().strip()
concatenates_text += text
bytesize = concatenates_text.encode("utf-8")
p = math.pow(1024, 2)
mbsize = round(len(bytesize) / p, 2)
st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
return concatenates_text
def split_into_chunks(text, chunk_size):
chunks = []
for i in range(0, len(text), chunk_size):
chunks.append(text[i:i + chunk_size])
return chunks
def create_embeddings():
# Get the uploaded file
inputtext = ""
with st.sidebar:
uploaded_files = st.session_state["uploaded_files"]
for uploaded_file in uploaded_files:
# Read the contents of the file
reader = PyPDF2.PdfReader(uploaded_file)
pages = reader.pages
print_out(pages)
inputtext = combine_text(pages)
# connect to pinecone index
pinecone = connect_pinecone()
index = get_pinecone_semantic_index(pinecone)
# The maximum metadata size per vector is 40KB ~ 40000Bytes ~ each text character is 1 to 2 bytes. so rougly given chunk size of 10000 to 40000
chunk_size = 10000
batch_size = 2
chunks = split_into_chunks(inputtext, chunk_size)
for i in tqdm(range(0, len(chunks), batch_size)):
# find end of batch
end = min(i + batch_size, len(chunks))
# create ids batch
ids = [str(i) for i in range(i, end)]
# create metadata batch
metadata = [{"text": text} for text in chunks[i:end]]
# create embeddings
xc = model.encode(chunks[i:end])
# create records list for upsert
records = zip(ids, xc, metadata)
# upsert records
index.upsert(vectors=records)
with st.sidebar:
st.write("created vector embeddings!")
# check no of records in the index
st.write(f"{index.describe_index_stats()}")
# Display the contents of the file
# st.write(file_contents)
with st.sidebar:
st.markdown("""
***:red[Follow this steps]***
- upload pdf file to create embeddings using model on your own docs
- wait see success message on embeddings creation
- It Takes couple of mins after upload the pdf
- Now Chat with your documents with help of this RAG system
- It Generate Promted reponses on the upload pdf
- Provides summarized results and QA's using GPT models
- This system already trained on some wikipedia datasets too
""")
uploaded_files = st.file_uploader('Choose your .pdf file', type="pdf", accept_multiple_files=True, key="uploaded_files", on_change=create_embeddings)
# for uploaded_file in uploaded_files:
# To read file as bytes:
# bytes_data = uploaded_file.getvalue()
# st.write(bytes_data)
# To convert to a string based IO:
# stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
# st.write(stringio)
# To read file as string:
# string_data = stringio.read()
# st.write(string_data)
# Can be used wherever a "file-like" object is accepted:
# dataframe = pd.read_csv(uploaded_file)
# st.write(dataframe)
# reader = PyPDF2.PdfReader(uploaded_file)
# pages = reader.pages
# print_out(pages)
# combine_text(pages)
# promt_engineer(text)