Spaces:
Runtime error
Runtime error
File size: 10,812 Bytes
4997aeb da863bf 3ae066d 0bff6fd 940c185 517f1a0 f06193e bb5b4ac 26a8844 0f3245e 26a8844 9c2d532 4997aeb 0fb4cf5 4997aeb 8eb3e51 4997aeb 2da6f20 4997aeb 2da6f20 4997aeb 2da6f20 4997aeb 2da6f20 4997aeb e496258 d9af0b6 4997aeb 8ce3d9b 4997aeb 9eb3e78 11bc07e 9eb3e78 dcca063 9eb3e78 79497a3 c9dd21c a695e9a 9eb3e78 2376b2f 64523d8 9eb3e78 1b75632 9eb3e78 c9dd21c 9eb3e78 1b75632 9eb3e78 f76455a 64523d8 9eb3e78 26a8844 a695e9a 26a8844 a1f90e6 0a8201e a1f90e6 26a8844 a1f90e6 abef2ac a1f90e6 8c993f6 065cb17 a1f90e6 c15629f a1f90e6 26a8844 a695e9a 26a8844 a695e9a 26a8844 4baf582 8ce3d9b 9eb3e78 5d2299c 9eb3e78 8ce3d9b 26a8844 dcb00f7 3d67d69 dcb00f7 a1f90e6 a969331 a1f90e6 a969331 c2e5bed a969331 78c999a a969331 909aec0 64523d8 e488916 160526f def373d a695e9a 544f3f0 150b8d9 909aec0 26a8844 909aec0 6725ef7 8ce3d9b 23247c4 8ce3d9b 9eb3e78 8a3a5d7 940c185 f06193e 80682f3 f06193e 80682f3 f06193e 517f1a0 f06193e 7c92df9 ec7fc78 517f1a0 609ef88 517f1a0 a78496d db780e3 a78496d 517f1a0 a78496d 517f1a0 a78496d 517f1a0 a78496d 517f1a0 ec7fc78 d34a703 792de2f c15629f 7a0f54c c15629f a1f90e6 81d7170 792de2f 61c3232 ec7fc78 7d43644 39dae03 7d43644 940c185 609ef88 ec7fc78 517f1a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 |
import streamlit as st
import os
from streamlit_chat import message
import numpy as np
import pandas as pd
from io import StringIO
import PyPDF2
from tqdm.auto import tqdm
import math
from transformers import pipeline
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import HuggingFaceHub
import re
# import json
# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
# from datasets import load_dataset
# dataset = load_dataset("wikipedia", "20220301.en", split="train[240000:250000]")
# wikidata = []
# for record in dataset:
# wikidata.append(record["text"])
# wikidata = list(set(wikidata))
# # print("\n".join(wikidata[:5]))
# # print(len(wikidata))
from sentence_transformers import SentenceTransformer
import torch
# Prefer the GPU when one is available; otherwise fall back to the CPU and
# tell the user that responses will be slower.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    st.markdown(f"Note: Using {device}. Expected slow responses compare to CUDA-enabled GPU. Please be patient thanks")


@st.cache_resource
def _load_embedding_model(device_name):
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer("all-MiniLM-L6-v2", device=device_name)


# Streamlit re-executes this script on every interaction; going through the
# cached loader avoids rebuilding the model each time.
model = _load_embedding_model(device)
st.divider()
# Create an index (Pinecone vector database)
import os
# import pinecone
from pinecone.grpc import PineconeGRPC
# Load variables from a local .env file *before* reading any of them, so that
# values defined there are visible to every os.getenv() call below.
# python-dotenv is treated as optional: when it is not installed the process
# environment is used as-is.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

# Credentials and settings, read from the environment (None when unset).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
def connect_pinecone():
    """Build a Pinecone gRPC client from the module-level credentials.

    Returns:
        The ``PineconeGRPC`` client created with ``PINECONE_API_KEY`` and
        ``PINECONE_ENV``.
    """
    client = PineconeGRPC(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    return client
def get_pinecone_semantic_index(pinecone):
    """Return a handle to the semantic-search index, creating it if missing.

    Args:
        pinecone: Pinecone client exposing ``list_indexes()``,
            ``create_index()`` and ``Index()``.

    Returns:
        The index handle returned by ``pinecone.Index``.
    """
    index_name = "sematic-search-index"
    # Only create the index if it does not exist yet.
    if index_name not in pinecone.list_indexes().names():
        # Imported locally: the module uses ServerlessSpec but never imports it.
        from pinecone import ServerlessSpec

        # NOTE(review): the former `description=` keyword was dropped because
        # create_index() does not appear to accept it -- confirm against the
        # installed client version.
        pinecone.create_index(
            name=index_name,
            # The index dimension must match the embedding model's output size.
            dimension=model.get_sentence_embedding_dimension(),
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1'),
        )
    # Now connect to the index.
    return pinecone.Index(index_name)
# Prompt fed to the summarization model; ``{text}`` is filled in per call.
_SUMMARY_PROMPT_TEMPLATE = """
write a concise summary of the following text delimited by triple backquotes.
return your response in bullet points which convers the key points of the text.
```{text}```
BULLET POINT SUMMARY:
"""

# Prompt fed to the generation model; ``{context}`` and ``{question}`` are
# filled in per call.
_GENERATION_PROMPT_TEMPLATE = """
Instructions:
-------------------------------------------------------------------------------------------------------------------------------
Answer the question only based on the below context:
- You're a Research AI expert in the explaining and reading the research papers.
- Questions with out-of-context replay with The question is out of context.
- Always try to provide Keep it simple answers in nice format without incomplete sentence.
- Give the answer atleast 5 seperate lines addition to the title info.
- Only If question is relevent to context provide Doc Title: <title> Paragraph: <Paragraph> Page No: <pagenumber>
-------------------------------------------------------------------------------------------------------------------------------
{context}
-------------------------------------------------------------------------------------------------------------------------------
Answer the question based on the above context: {question}
"""


@st.cache_resource
def _get_summarizer():
    """Build the summarization pipeline once and reuse it on later calls."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


def prompt_engineer(text, longtext, query):
    """Summarize ``text`` and answer ``query`` using ``longtext`` as context.

    The summary is written to the sidebar. The answer is requested from a
    hosted model; any error raised while doing so is shown with ``st.error``
    and the answer is returned as an empty string.

    Args:
        text: Text to summarize.
        longtext: Context inserted into the generation prompt.
        query: The user's question.

    Returns:
        A ``(summary, result)`` tuple of strings.
    """
    # Summarize with the cached pipeline instead of rebuilding it per call.
    summary_prompt = _SUMMARY_PROMPT_TEMPLATE.format(text=text)
    summary = _get_summarizer()(summary_prompt, max_length=1024, min_length=50)[0]["summary_text"]

    with st.sidebar:
        st.divider()
        st.markdown("*:red[Text Summary Generation]* from above Top 5 **:green[similarity search results]**.")
        st.write(summary)
        st.divider()

    prompt_template = ChatPromptTemplate.from_template(_GENERATION_PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=longtext, question=query)

    result = ""
    try:
        llm = HuggingFaceHub(
            repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
            model_kwargs={"temperature": 0.1, "max_new_tokens": 256, "task": "text-generation"},
        )
        response_text = llm.invoke(prompt)
        # The response may contain the prompt itself; keep only what follows
        # the last occurrence of the final question line (the whole response
        # when that line is absent).
        marker = f"Answer the question based on the above context: {query}\n"
        result = response_text.rpartition(marker)[2]
        # NOTE(review): the trailing characters in this message look like a
        # mis-encoded emoji -- confirm the intended glyph.
        st.write("reponse generated see chat window ππ»")
        st.divider()
    except Exception as e:
        # Top-level UI boundary: surface the failure instead of crashing.
        st.error(f"Error invoke: {e}")
    return summary, result
def chat_actions():
    """Handle a submitted chat message.

    Reads the question from ``st.session_state["chat_input"]``, records it in
    ``st.session_state["chat_history"]``, queries the index for the five
    closest matches, shows them in the sidebar, and appends an assistant
    reply to the chat history.
    """
    pinecone = connect_pinecone()
    index = get_pinecone_semantic_index(pinecone)

    query = st.session_state["chat_input"]
    st.session_state["chat_history"].append({"role": "user", "content": query})

    # Embed the question and search the vector database with it.
    query_vector = model.encode(query).tolist()
    result = index.query(vector=query_vector, top_k=5, include_metadata=True)
    matches = result['matches']

    # NOTE(review): the character after the rank looks like a mis-encoded
    # symbol -- confirm the intended glyph.
    rows = [
        [f"{rank}β", match['score'], match['metadata']['text']]
        for rank, match in enumerate(matches, start=1)
    ]
    consolidated_text = "".join(match['metadata']['text'] for match in matches)
    resdf = pd.DataFrame(rows, columns=['TopRank', 'Score', 'Text'])

    with st.sidebar:
        st.markdown("*:red[semantic search results]* with **:green[Retrieval Augmented Generation]** ***(RAG)***.")
        st.dataframe(resdf)
        mbsize = round(len(consolidated_text.encode("utf-8")) / 1024 ** 2, 2)
        st.write(f"Text length of {len(consolidated_text)} characters with {mbsize}MB size")

    if not matches:
        # Nothing was retrieved: answer explicitly rather than leaving the
        # user's message without any reply, and skip the model calls.
        st.session_state["chat_history"].append(
            {"role": "assistant", "content": "No matching content was found for this question."},
        )
        return

    summary, response = prompt_engineer(consolidated_text[:1024], consolidated_text, query)
    st.session_state["chat_history"].append(
        {"role": "assistant", "content": f"{response}"},
    )
# Make sure the conversation log exists before anything reads from it.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

# Chat box; chat_actions runs when the user submits a message.
st.chat_input(
    "show me the contents of ML paper published on xxx with article no. xx?",
    on_submit=chat_actions,
    key="chat_input",
)

# Replay every stored message under its role.
for entry in st.session_state["chat_history"]:
    with st.chat_message(name=entry["role"]):
        st.write(entry["content"])
def print_out(pages):
    """Write the stripped text of each page to the app, one entry per page.

    Args:
        pages: Sequence of page objects exposing ``extract_text()``.
    """
    for page_no, page in enumerate(pages):
        # Page numbers are zero-based, matching the position in ``pages``.
        st.write(f"Page {page_no} : {page.extract_text().strip()}")
def combine_text(pages):
    """Concatenate the stripped text of all pages and report its size.

    Args:
        pages: Iterable of page objects exposing ``extract_text()``.

    Returns:
        The text of every page joined together with no separator.
    """
    # Join once instead of growing a string page by page; ``or ""`` guards
    # against a page whose extract_text() yields nothing.
    concatenates_text = "".join(
        (page.extract_text() or "").strip() for page in tqdm(pages)
    )
    # Size of the UTF-8 encoded text in mebibytes, rounded to two decimals.
    mbsize = round(len(concatenates_text.encode("utf-8")) / 1024 ** 2, 2)
    st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
    return concatenates_text
def split_into_chunks(text, chunk_size):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` items.

    Args:
        text: The string (or other sliceable sequence) to split.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        A list of slices in order; the last one may be shorter. Empty input
        gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would silently produce no chunks at all (dropping the
    # whole text), so reject non-positive sizes explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
def create_embeddings():
    """Embed every uploaded PDF and upsert its chunks into the index.

    Reads the files from ``st.session_state["uploaded_files"]``. For each
    file the page text is shown and combined in the sidebar, split into
    chunks, embedded with ``model`` and upserted in small batches. Vector ids
    are prefixed with a hash of the file name so that chunks of different
    files do not overwrite each other.
    """
    import hashlib

    uploaded_files = st.session_state["uploaded_files"]
    if not uploaded_files:
        # Nothing selected (for example the selection was cleared).
        return

    # Connect to the Pinecone index.
    pinecone = connect_pinecone()
    index = get_pinecone_semantic_index(pinecone)

    # The maximum metadata size per vector is 40KB; with one to a few bytes
    # per character, roughly 10000 characters per chunk stays within it.
    chunk_size = 10000
    batch_size = 2

    for uploaded_file in uploaded_files:
        with st.sidebar:
            # Read the contents of the file.
            pages = PyPDF2.PdfReader(uploaded_file).pages
            print_out(pages)
            inputtext = combine_text(pages)

        # Short, ASCII-only tag that keeps this file's ids distinct.
        file_tag = hashlib.sha1(uploaded_file.name.encode("utf-8")).hexdigest()[:12]
        chunks = split_into_chunks(inputtext, chunk_size)

        for start in tqdm(range(0, len(chunks), batch_size)):
            batch = chunks[start:start + batch_size]
            ids = [f"{file_tag}-{start + offset}" for offset in range(len(batch))]
            metadata = [{"text": chunk} for chunk in batch]
            embeddings = model.encode(batch)
            # Materialize plain lists so the client receives a concrete list
            # of (id, values, metadata) tuples rather than a one-shot iterator.
            records = list(zip(ids, embeddings.tolist(), metadata))
            index.upsert(vectors=records)

    with st.sidebar:
        st.write("created vector embeddings!")
        # Show the number of records currently in the index.
        st.write(f"{index.describe_index_stats()}")
# Usage instructions rendered at the top of the sidebar.
_SIDEBAR_GUIDE = """
***:red[Follow this steps]***
- upload pdf file to create embeddings using model on your own docs
- wait see success message on embeddings creation
- It Takes couple of mins after upload the pdf
- Now Chat with your documents with help of this RAG system
- It Generate Promted reponses on the upload pdf
- Provides summarized results and QA's using GPT models
- This system already trained on some wikipedia datasets too
"""

with st.sidebar:
    st.markdown(_SIDEBAR_GUIDE)
    # create_embeddings runs whenever the set of uploaded files changes.
    uploaded_files = st.file_uploader(
        'Choose your .pdf file',
        type="pdf",
        accept_multiple_files=True,
        key="uploaded_files",
        on_change=create_embeddings,
    )
# for uploaded_file in uploaded_files:
# To read file as bytes:
# bytes_data = uploaded_file.getvalue()
# st.write(bytes_data)
# To convert to a string based IO:
# stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
# st.write(stringio)
# To read file as string:
# string_data = stringio.read()
# st.write(string_data)
# Can be used wherever a "file-like" object is accepted:
# dataframe = pd.read_csv(uploaded_file)
# st.write(dataframe)
# reader = PyPDF2.PdfReader(uploaded_file)
# pages = reader.pages
# print_out(pages)
# combine_text(pages)
# promt_engineer(text) |