Upload 10 files
- .gitattributes +2 -0
- app.py +48 -0
- data/output.md +0 -0
- data/parsed_data.pkl +3 -0
- pdf_resource/BN81-25561C-300_EUG_ROPDVBEUD_EU_ENG_240507.0.pdf +3 -0
- pdf_resource/D3300_NT(En)02.pdf +3 -0
- requirements.txt +26 -0
- src/utils/image_qa.py +20 -0
- src/utils/ingest_image.py +49 -0
- src/utils/ingest_text.py +116 -0
- src/utils/text_qa.py +91 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pdf_resource/BN81-25561C-300_EUG_ROPDVBEUD_EU_ENG_240507.0.pdf filter=lfs diff=lfs merge=lfs -text
+pdf_resource/D3300_NT(En)02.pdf filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
import os

import streamlit as st
import nest_asyncio
from dotenv import load_dotenv

from src.utils.ingest_text import create_vector_database
from src.utils.ingest_image import extract_and_store_images
from src.utils.text_qa import qa_bot
from src.utils.image_qa import query_and_print_results

nest_asyncio.apply()
load_dotenv()

def get_answer(query, chain):
    response = chain.invoke(query)
    return response['result']

st.title("MULTIMODAL DOC QA")
uploaded_file = st.file_uploader("File upload", type="pdf")
if uploaded_file is not None:
    # Save the uploaded file to a temporary location
    with open(uploaded_file.name, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Get the absolute path of the saved file
    path = os.path.abspath(uploaded_file.name)
    st.write(f"File saved to: {path}")
    print(path)

    st.write("Document uploaded successfully!")

    if st.button("Start Processing"):
        with st.spinner("Processing"):
            # Keep the stores in session_state so they survive the rerun
            # Streamlit triggers on every chat interaction
            st.session_state.client = create_vector_database(path)
            st.session_state.image_vdb = extract_and_store_images(path)
            st.session_state.chain = qa_bot(st.session_state.client)

if user_input := st.chat_input("User Input"):
    with st.chat_message("user"):
        st.markdown(user_input)

    with st.spinner("Generating Response..."):
        # get_answer expects (query, chain) and already returns the
        # 'result' string, so no second ['result'] lookup is needed
        answer = get_answer(user_input, st.session_state.chain)
        st.markdown(answer)
        query_and_print_results(st.session_state.image_vdb, user_input)
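The app pulls its credentials from a local .env file via load_dotenv(). A minimal sketch of that file, with variable names taken from the ingest and QA modules below and placeholder values; QDRANT_URL and QDRANT_API_KEY are only needed for the commented-out hosted-Qdrant path:

# .env (placeholders, not real keys)
LLAMA_CLOUD_API_KEY=llx-xxxxxxxx
GROQ_API_KEY=gsk_xxxxxxxx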
data/output.md
ADDED
The diff for this file is too large to render.
data/parsed_data.pkl
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:dfd4afa8be2cd891a38e907116138938acf6ad75db94020d878e6d6725b31c12
size 125666
pdf_resource/BN81-25561C-300_EUG_ROPDVBEUD_EU_ENG_240507.0.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:be1cf9a6367dcb45d25e0674217193c0c1c1fbe4f284432febe721fb68b05f91
size 4355285
pdf_resource/D3300_NT(En)02.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:9d46bf0dc31ea22547354e845c58d22501fece0e6c5a78e5684e88f3806559d0
size 5830738
requirements.txt
ADDED
langchain==0.1.15
langchain-community==0.0.34
langchain-experimental==0.0.57
langchain-google-genai==1.0.1
langchain-openai==0.0.7
langchain-groq==0.0.1
langchain-text-splitters==0.0.1
langchainhub==0.1.15
fastembed==0.2.5
llama-index-core==0.10.25.post1
llama-parse==0.4.0
llamaindex-py-client==0.1.15
qdrant-client==1.8.2
streamlit==1.33.0
streamlit-float==0.3.2
unstructured==0.12.5
unstructured-client==0.18.0
pypdf==4.3.0
python-docx==1.1.2
PyMuPDF==1.24.7
chromadb
langchain-core
open-clip-torch
langchain_chroma
src/utils/image_qa.py
ADDED
import streamlit as st
from PIL import Image as PILImage

def query_and_print_results(image_vdb, query):
    results = 3
    # Query the image collection; ChromaDB returns per-query lists,
    # so ['uris'][0] holds the URIs for this single query
    query_results = image_vdb.query(
        query_texts=[query],
        n_results=results,
        include=['uris', 'distances']
    )

    # Display the matching images; st.image expects a PIL image or a
    # file path rather than an IPython.display.Image
    for idx, uri in enumerate(query_results['uris'][0]):
        img = PILImage.open(uri)
        st.image(img, width=300)

# Testing it out
src/utils/ingest_image.py
ADDED
import os
import fitz  # PyMuPDF
import chromadb
from chromadb.utils.data_loaders import ImageLoader
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction  # type: ignore

def extract_and_store_images(pdf_path, db_path='image_vdb', images_dir='extracted_images'):
    # Step 1: Extract images from PDF
    pdf_document = fitz.open(pdf_path)
    os.makedirs(images_dir, exist_ok=True)

    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        image_list = page.get_images(full=True)

        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"{images_dir}/page_{page_num+1}_img_{image_index+1}.{image_ext}"

            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)
            print(f"Saved: {image_filename}")

    print("Image extraction complete.")

    # Step 2: Add extracted images to ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    image_loader = ImageLoader()
    CLIP = OpenCLIPEmbeddingFunction()
    image_vdb = chroma_client.get_or_create_collection(name="image", embedding_function=CLIP, data_loader=image_loader)

    ids = []
    uris = []

    for i, filename in enumerate(sorted(os.listdir(images_dir))):
        # PyMuPDF reports extensions such as 'jpeg', 'jpg' and 'png'
        if filename.endswith(('.jpeg', '.jpg', '.png')):
            file_path = os.path.join(images_dir, filename)
            ids.append(str(i))
            uris.append(file_path)

    image_vdb.add(ids=ids, uris=uris)
    print("Images added to the database.")

    return image_vdb

# Example usage
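The trailing "# Example usage" stub, filled in as a minimal sketch; it assumes one of the PDFs bundled in this commit and a run from the repository root (the query string is illustrative):

from src.utils.ingest_image import extract_and_store_images

image_vdb = extract_and_store_images("pdf_resource/D3300_NT(En)02.pdf")
# The returned collection is queryable right away; results come back as
# per-query lists, hence the [0] index for this single query
hits = image_vdb.query(query_texts=["attaching the camera strap"],
                       n_results=3, include=['uris', 'distances'])
print(hits['uris'][0])
print(hits['distances'][0])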
src/utils/ingest_text.py
ADDED
from llama_parse import LlamaParse
from langchain_chroma import Chroma
from qdrant_client import QdrantClient
from langchain_community.vectorstores.qdrant import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.document_loaders.directory import DirectoryLoader
import os
import pickle
from fastembed import TextEmbedding
from typing import List

import nest_asyncio
nest_asyncio.apply()

llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
#qdrant_url = os.getenv("QDRANT_URL")
#qdrant_api_key = os.getenv("QDRANT_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# NOTE: machine-specific absolute paths; adjust for your environment
parsed_data_file = r"C:\Users\Naresh Kumar Lahajal\Desktop\multimodal\data\parsed_data.pkl"
output_md = r"C:\Users\Naresh Kumar Lahajal\Desktop\multimodal\data\output.md"
loki = r"C:\Users\Naresh Kumar Lahajal\Desktop\multimodal\data"

# Define a function to load parsed data if available, or parse if not
def load_or_parse_data(loc):
    data_file = parsed_data_file

    if os.path.exists(data_file):
        # Load the parsed data from the file
        with open(data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        # Perform the parsing step and store the result in llama_parse_documents
        parsingInstructiontest10k = """The provided document is a user guide or a manual.
It contains many images and tables.
Try to be precise while answering the questions."""
        parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructiontest10k)  # type: ignore
        llama_parse_documents = parser.load_data(loc)

        # Save the parsed data to a file
        with open(data_file, "wb") as f:
            pickle.dump(llama_parse_documents, f)

        # Set the parsed data to the variable
        parsed_data = llama_parse_documents

    return parsed_data


# Create vector database
def create_vector_database(loc):
    """
    Creates a vector database using document loaders and embeddings.

    This function parses (or reloads) the PDF, appends the parsed markdown
    to output_md, splits the loaded documents into chunks, transforms them
    into embeddings using FastEmbedEmbeddings, and finally persists the
    embeddings into a local Qdrant vector database.
    """
    # Call the function to either load or parse the data
    llama_parse_documents = load_or_parse_data(loc)

    with open(output_md, 'a', encoding='utf-8') as f:  # Open the file in append mode ('a')
        for doc in llama_parse_documents:
            f.write(doc.text + '\n')

    loader = DirectoryLoader(loki, glob="**/*.md", show_progress=True)
    documents = loader.load()

    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)
    print(f'Data chunked into {len(docs)} documents')

    # Initialize Embeddings
    embeddings = FastEmbedEmbeddings()  # type: ignore
    #embeddings = TextEmbedding()

    print('Vector DB started!')

    # Create and persist a local Qdrant vector database from the chunked documents
    qdrant = Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        path="local_qdrant",
        #url=qdrant_url,
        #api_key=qdrant_api_key,
        collection_name="rag",
    )

    # Chroma alternative, kept for reference:
    #db2 = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db")
    #db3 = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

    print('Vector DB created successfully!')
    #found_doc = qdrant.similarity_search("Switching between external devices connected to the TV", k=3)
    #print(found_doc)
    return qdrant
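A minimal smoke test for this module; it assumes LLAMA_CLOUD_API_KEY is set, the three data paths above exist, and it is run from the repository root (the query string is illustrative):

from src.utils.ingest_text import create_vector_database

qdrant = create_vector_database("pdf_resource/D3300_NT(En)02.pdf")
# Pull the closest chunks straight from the vector store
found_docs = qdrant.similarity_search("How do I format the memory card?", k=3)
for doc in found_docs:
    print(doc.page_content[:100])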
src/utils/text_qa.py
ADDED
import os
from typing import List
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from qdrant_client import QdrantClient
#from langchain_community.chat_models import ChatOllama

from langchain.chains import RetrievalQA

# bring in our GROQ_API_KEY
from dotenv import load_dotenv
load_dotenv()

llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

custom_prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know. If the question is out of context, say so, but still try to provide a helpful answer, and don't be rude.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

def set_custom_prompt():
    """
    Prompt template for QA retrieval for each vectorstore
    """
    prompt = PromptTemplate(template=custom_prompt_template,
                            input_variables=['context', 'question'])
    return prompt


chat_model = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768", api_key=groq_api_key)  # type: ignore
#chat_model = ChatGroq(temperature=0, model_name="Llama2-70b-4096")
#chat_model = ChatOllama(model="llama2", request_timeout=30.0)

#client = QdrantClient(api_key=qdrant_api_key, url=qdrant_url)


def retrieval_qa_chain(llm, prompt, vectorstore):
    # "stuff" packs the top-k retrieved chunks directly into the prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={'k': 2}),
        return_source_documents=True,
        chain_type_kwargs={'prompt': prompt}
    )
    return qa_chain


def qa_bot(qdrant):
    vectorstore = qdrant
    llm = chat_model
    qa_prompt = set_custom_prompt()
    qa = retrieval_qa_chain(llm, qa_prompt, vectorstore)
    return qa

#---------------------------------------------------------------------#
# Connecting to a hosted Qdrant instance instead of the local store:
#qdrant_cloud = Qdrant.from_documents(
#    docs,
#    embeddings,
#    url=qdrant_url,
#    prefer_grpc=True,
#    api_key=qdrant_api_key,
#    collection_name="qdrant_cloud_documents",
#)
#---------------------------------------------------------------------#
# Quick test:
#chain = qa_bot(qdrant)
#response = chain.invoke("how to make coffee")
#print(response['result'])
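Putting the pieces together outside Streamlit; a minimal end-to-end sketch, assuming the API keys are set and using one of the manuals bundled in this commit (the query is illustrative):

from src.utils.ingest_text import create_vector_database
from src.utils.ingest_image import extract_and_store_images
from src.utils.text_qa import qa_bot

path = "pdf_resource/BN81-25561C-300_EUG_ROPDVBEUD_EU_ENG_240507.0.pdf"
qdrant = create_vector_database(path)
image_vdb = extract_and_store_images(path)

chain = qa_bot(qdrant)
response = chain.invoke("Switching between external devices connected to the TV")
print(response['result'])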