Spaces:

KushwanthK
/

chat-with-docs

Runtime error

File size: 7,439 Bytes

4997aeb
 
da863bf
3ae066d
 
0bff6fd
940c185
 
f06193e
a1f90e6
9c2d532
4997aeb
0fb4cf5
4997aeb
8eb3e51
4997aeb
2da6f20
4997aeb
 
2da6f20
4997aeb
2da6f20
 
4997aeb
2da6f20
 
 
4997aeb
 
 
 
 
 
e496258
 
 
 
4997aeb
 
8ce3d9b
4997aeb
 
9eb3e78
11bc07e
9eb3e78
dcca063
 
9eb3e78
 
 
 
 
79497a3
c9dd21c
9eb3e78
2376b2f
64523d8
 
 
 
 
9eb3e78
 
 
 
 
 
c9dd21c
9eb3e78
 
 
 
 
c9dd21c
9eb3e78
 
f76455a
64523d8
9eb3e78
 
a1f90e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abef2ac
a1f90e6
 
 
 
8ce3d9b
9eb3e78
 
5d2299c
9eb3e78
8ce3d9b
 
 
 
dcb00f7
 
3d67d69
dcb00f7
a1f90e6
a969331
 
 
a1f90e6
a969331
c2e5bed
a969331
 
a1f90e6
a969331
 
 
 
909aec0
64523d8
e488916
a1f90e6
544f3f0
150b8d9
909aec0
 
 
a969331
909aec0
 
6725ef7
8ce3d9b
 
 
 
 
23247c4
8ce3d9b
 
 
 
9eb3e78
 
 
dcca063
 
 
9eb3e78
dcca063
 
9eb3e78
dcca063
 
 
940c185
8a3a5d7
940c185
 
 
 
f06193e
 
 
 
 
80682f3
f06193e
80682f3
f06193e
 
ec7fc78
 
609ef88
 
 
 
 
 
 
 
ec7fc78
609ef88
ec7fc78
 
 
 
6e57cc9
d34a703
792de2f
 
7a0f54c
 
 
 
 
a1f90e6
792de2f
61c3232
ec7fc78
7d43644
 
 
 
 
39dae03
 
7d43644
 
 
 
 
 
940c185
 
609ef88
ec7fc78
 
 
 
6e57cc9
a1f90e6

import streamlit as st
import os
from streamlit_chat import message
import numpy as np
import pandas as pd
from io import StringIO
import PyPDF2
from tqdm import tqdm
import math
from transformers import pipeline
# import json

# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")

# from datasets import load_dataset

# dataset = load_dataset("wikipedia", "20220301.en", split="train[240000:250000]")


# wikidata = []

# for record in dataset:
#     wikidata.append(record["text"])

# wikidata = list(set(wikidata))
# # print("\n".join(wikidata[:5]))
# # print(len(wikidata))

from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device != 'cuda':
    st.markdown(f"you are using {device}. This is much slower than using "
    "a CUDA-enabled GPU. If on colab you can change this by "
    "clicking Runtime > change runtime type > GPU.")


@st.cache_resource(show_spinner=False)
def _load_embedding_model(model_name, target_device):
    """Build the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes this script from the top on every interaction;
    without caching, the model would be re-instantiated each time.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
        target_device: Device string passed to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance (shared via Streamlit's
        resource cache).
    """
    return SentenceTransformer(model_name, device=target_device)


model = _load_embedding_model("all-MiniLM-L6-v2", device)
st.divider()

# Creating an index (Pinecone vector database)
import os
# import pinecone

from pinecone.grpc import PineconeGRPC


# Pinecone settings are read from the process environment; each value is
# None when the corresponding variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

# pc = PineconeGRPC( api_key=os.environ.get("PINECONE_API_KEY") ) # Now do stuff if 'my_index' not in pc.list_indexes().names(): pc.create_index( name='my_index', dimension=1536, metric='euclidean', spec=ServerlessSpec( cloud='aws', region='us-west-2' ) )

def connect_pinecone():
    """Create and return a Pinecone gRPC client.

    The client is built from the module-level ``PINECONE_API_KEY`` and
    ``PINECONE_ENV`` values.
    """
    return PineconeGRPC(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_ENV,
    )

def get_pinecone_semantic_index(pinecone):
    """Return a handle to the semantic-search index, creating it if missing.

    Args:
        pinecone: Client object exposing ``list_indexes()``,
            ``create_index(...)`` and ``Index(name)`` (as returned by
            ``connect_pinecone``).

    Returns:
        Whatever ``pinecone.Index(index_name)`` returns for the index.
    """
    # NOTE: the spelling is kept as-is; renaming it would point the app at a
    # different index.
    index_name = "sematic-search"

    # Only create the index if it does not exist yet.
    if index_name not in pinecone.list_indexes().names():
        # Imported here because the file only imports PineconeGRPC at module
        # level; ServerlessSpec was previously an undefined name on this path.
        from pinecone import ServerlessSpec

        # `description=` was dropped: it is not a parameter in the client's
        # documented create_index signature.
        pinecone.create_index(
            name=index_name,
            dimension=model.get_sentence_embedding_dimension(),
            metric="cosine",
            spec=ServerlessSpec(cloud='gcp', region='us-central1'),
        )

    # Now connect to the (existing or freshly created) index.
    return pinecone.Index(index_name)

def promt_engineer(text):
    """Summarize *text* and write the summary to the Streamlit sidebar.

    The text is embedded in a fixed prompt and passed to a
    ``facebook/bart-large-cnn`` summarization pipeline.

    Args:
        text: The text to summarize.

    Returns:
        None. The summary is rendered with ``st.write`` inside the sidebar.
    """
    # Previously bound as `promt_template` but read as `prompt_template`,
    # which raised NameError on every call.
    prompt_template = """
    write a concise summary of the following text delimited by triple backquotes.
    return your response in bullet points which convers the key points of the text.

    ```{text}```

    BULLET POINT SUMMARY:
    """
    # Load the summarization pipeline with the specified model
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    # Generate the prompt
    prompt = prompt_template.format(text=text)

    # Generate the summary. truncation=True keeps an over-long prompt
    # (several retrieved passages concatenated) within the model's input
    # limit instead of failing.
    outputs = summarizer(prompt, max_length=1024, min_length=50, truncation=True)
    summary = outputs[0]["summary_text"]

    with st.sidebar:
        st.write(summary)

def chat_actions():
    """Handle a submitted chat message.

    Reads the message from ``st.session_state["chat_input"]``, records it in
    ``st.session_state["chat_history"]``, queries the Pinecone index for the
    five nearest matches, shows them (plus a summary) in the sidebar, and
    appends the text of the first match to the history as the assistant
    reply.
    """
    pinecone = connect_pinecone()
    index = get_pinecone_semantic_index(pinecone)

    user_query = st.session_state["chat_input"]
    st.session_state["chat_history"].append(
        {"role": "user", "content": user_query},
    )

    # Embed the query and convert it to a plain list for the index query.
    query_vector = model.encode(user_query).tolist()
    # The vector is passed by keyword so the call does not depend on the
    # positional order of query()'s parameters.
    result = index.query(vector=query_vector, top_k=5, include_metadata=True)
    matches = result['matches']

    # One row per match: rank label, similarity score, matched text.
    data = [
        [f"{rank}⭐", match['score'], match['metadata']['text']]
        for rank, match in enumerate(matches, start=1)
    ]
    # Previously built with str.append(), which does not exist and raised
    # AttributeError on the first match.
    consolidated_text = "".join(
        f"{match['metadata']['text']}\n\n" for match in matches
    )

    # Create a DataFrame from the list of lists
    resdf = pd.DataFrame(data, columns=['TopRank', 'Score', 'Text'])

    with st.sidebar:
        st.markdown("*:red[semantic search results]* with **:green[Retrieval Augmented Generation]** ***(RAG)***.")
        st.dataframe(resdf)
        promt_engineer(consolidated_text)

    # Only the first match is used as the assistant reply; nothing is
    # appended when there are no matches.
    top_match = next(iter(matches), None)
    if top_match is not None:
        st.session_state["chat_history"].append(
            {
                "role": "assistant",
                "content": f"{top_match['metadata']['text']}",
            },  # This can be replaced with your chat response logic
        )

# Start each new session with an empty conversation log.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []


# The input box stores its value under "chat_input" and runs chat_actions
# when a message is submitted.
st.chat_input(
    "show me the contents of ML paper published on xxx with article no. xx?",
    key="chat_input",
    on_submit=chat_actions,
)

# Replay the stored conversation, one chat bubble per entry.
for entry in st.session_state["chat_history"]:
    with st.chat_message(name=entry["role"]):
        st.write(entry["content"])

### Creating an index (Pinecone vector database)
# %%writefile .env
# PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
# PINECONE_ENV=os.getenv("PINECONE_ENV")
# PINECONE_ENVIRONMENT=os.getenv("PINECONE_ENVIRONMENT")

# import os
# import pinecone

# from pinecone import Index, GRPCIndex
# pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
# st.text(pinecone)

def print_out(pages):
    """Write the stripped text of each page to the Streamlit app.

    Each page is labelled with its zero-based position in *pages*.
    """
    for page_number, page in enumerate(pages):
        page_text = page.extract_text().strip()
        st.write(f"Page {page_number} : {page_text}")

def combine_text(pages):
    """Report how much text *pages* contain.

    The stripped text of every page is concatenated (no separator), and the
    character count plus the UTF-8 size in MB (rounded to two decimals) is
    written to the Streamlit app. Nothing is returned.
    """
    full_text = "".join(page.extract_text().strip() for page in tqdm(pages))
    encoded_length = len(full_text.encode("utf-8"))
    bytes_per_mb = math.pow(1024, 2)
    mbsize = round(encoded_length / bytes_per_mb, 2)
    st.write(f"There are {len(full_text)} characters in the pdf with {mbsize}MB size")

def create_embeddings():
    """Process the PDFs currently stored in ``st.session_state["uploaded_files"]``.

    For each file, the pages are printed and their combined text size is
    reported, all inside the sidebar. No embeddings are computed here; the
    function only writes a "created_embeddings" marker when it finishes.
    """
    with st.sidebar:
        for pdf_file in st.session_state["uploaded_files"]:
            # Parse the upload and hand its pages to the reporting helpers.
            pdf_pages = PyPDF2.PdfReader(pdf_file).pages
            print_out(pdf_pages)
            combine_text(pdf_pages)

        st.write("created_embeddings")

    # Display the contents of the file
    # st.write(file_contents)


# Sidebar: usage instructions followed by the PDF uploader.
with st.sidebar:
    st.markdown("""
    ***Follow this steps***
    - upload pdf file to create embeddings using model on your own docs
    - wait see success message on embeddings creation 
    - It Takes couple of mins after upload the pdf
    - Now Chat with model to get the summarized info 
    - Generate Promted reponses on the upload pdf
    - Provides summarized results and QA's using GPT models
    """)
    # The uploader stores its value under the "uploaded_files" session key
    # and runs create_embeddings whenever the selection changes.
    uploaded_files = st.file_uploader('Choose your .pdf file', type="pdf", accept_multiple_files=True, key="uploaded_files", on_change=create_embeddings)
    # for uploaded_file in uploaded_files:
        # To read file as bytes:
        # bytes_data = uploaded_file.getvalue()
        # st.write(bytes_data)

        # To convert to a string based IO:
        # stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
        # st.write(stringio)

        # To read file as string:
        # string_data = stringio.read()
        # st.write(string_data)

        # Can be used wherever a "file-like" object is accepted:
        # dataframe = pd.read_csv(uploaded_file)
        # st.write(dataframe)

        # reader = PyPDF2.PdfReader(uploaded_file)
        # pages = reader.pages
        # print_out(pages)
        # combine_text(pages)
        # promt_engineer(text)