NEXAS committed on
Commit 91c7e66 · verified · 1 Parent(s): 40da23c

Upload 10 files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pdf_resource/BN81-25561C-300_EUG_ROPDVBEUD_EU_ENG_240507.0.pdf filter=lfs diff=lfs merge=lfs -text
+pdf_resource/D3300_NT(En)02.pdf filter=lfs diff=lfs merge=lfs -text

app.py ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+import os
+from src.utils.ingest_text import create_vector_database
+from src.utils.ingest_image import extract_and_store_images
+from src.utils.text_qa import qa_bot
+from src.utils.image_qa import query_and_print_results
+import nest_asyncio
+nest_asyncio.apply()
+
+from dotenv import load_dotenv
+load_dotenv()
+
+def get_answer(query, chain):
+    response = chain.invoke(query)
+    return response['result']
+
+st.title("MULTIMODAL DOC QA")
+uploaded_file = st.file_uploader("File upload", type="pdf")
+if uploaded_file is not None:
+    # Save the uploaded file next to the app so the ingestion utilities can read it
+    with open(uploaded_file.name, "wb") as f:
+        f.write(uploaded_file.getbuffer())
+
+    # Get the absolute path of the saved file
+    path = os.path.abspath(uploaded_file.name)
+    st.write(f"File saved to: {path}")
+
+    st.write("Document uploaded successfully!")
+
+if st.button("Start Processing"):
+    with st.spinner("Processing"):
+        client = create_vector_database(path)
+        image_vdb = extract_and_store_images(path)
+        chain = qa_bot(client)
+
+if user_input := st.chat_input("User Input"):
+    with st.chat_message("user"):
+        st.markdown(user_input)
+
+    with st.spinner("Generating Response..."):
+        # get_answer already returns the 'result' string, so use it directly
+        answer = get_answer(user_input, chain)
+        st.markdown(answer)
+        query_and_print_results(image_vdb, user_input)

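The app is started with streamlit run app.py. Note that the chat handler assumes chain and image_vdb are still defined when a message arrives, but plain local variables do not survive Streamlit reruns. A minimal sketch of keeping them in st.session_state instead; this is an assumption about how one might harden the app, not part of the committed code:

# Sketch: persist processing results across Streamlit reruns via st.session_state.
if st.button("Start Processing"):
    with st.spinner("Processing"):
        st.session_state["chain"] = qa_bot(create_vector_database(path))
        st.session_state["image_vdb"] = extract_and_store_images(path)

if user_input := st.chat_input("User Input"):
    chain = st.session_state.get("chain")
    image_vdb = st.session_state.get("image_vdb")
    if chain is None or image_vdb is None:
        st.warning("Upload a PDF and click 'Start Processing' first.")
    else:
        with st.chat_message("user"):
            st.markdown(user_input)
        with st.spinner("Generating Response..."):
            st.markdown(get_answer(user_input, chain))
            query_and_print_results(image_vdb, user_input)
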
data/output.md ADDED
The diff for this file is too large to render. See raw diff
 
data/parsed_data.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfd4afa8be2cd891a38e907116138938acf6ad75db94020d878e6d6725b31c12
+size 125666
pdf_resource/BN81-25561C-300_EUG_ROPDVBEUD_EU_ENG_240507.0.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be1cf9a6367dcb45d25e0674217193c0c1c1fbe4f284432febe721fb68b05f91
+size 4355285
pdf_resource/D3300_NT(En)02.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d46bf0dc31ea22547354e845c58d22501fece0e6c5a78e5684e88f3806559d0
+size 5830738
requirements.txt ADDED
@@ -0,0 +1,26 @@
+langchain==0.1.15
+langchain-community==0.0.34
+langchain-experimental==0.0.57
+langchain-google-genai==1.0.1
+langchain-openai==0.0.7
+langchain-groq==0.0.1
+langchain-text-splitters==0.0.1
+langchainhub==0.1.15
+fastembed==0.2.5
+llama-index-core==0.10.25.post1
+llama-parse==0.4.0
+llamaindex-py-client==0.1.15
+qdrant-client==1.8.2
+streamlit==1.33.0
+streamlit-float==0.3.2
+unstructured==0.12.5
+unstructured-client==0.18.0
+pypdf==4.3.0
+python-docx==1.1.2
+PyMuPDF==1.24.7
+chromadb
+langchain-core
+open-clip-torch
+langchain_chroma
+qdrant_client
+fastembed

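The utilities below read several API keys through os.getenv. A small hedged pre-flight check; the variable names match the ones used in src/utils, but the check itself is an assumption, not part of the repo:

# Hypothetical pre-flight check for the environment variables the utils expect.
import os
from dotenv import load_dotenv

load_dotenv()  # picks up a local .env file

required = ["LLAMA_CLOUD_API_KEY", "GROQ_API_KEY"]
optional = ["QDRANT_URL", "QDRANT_API_KEY"]  # only needed for a remote Qdrant
missing = [key for key in required if not os.getenv(key)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
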
src/utils/image_qa.py ADDED
@@ -0,0 +1,20 @@
+import streamlit as st
+
+def query_and_print_results(image_vdb, query):
+    n_results = 3
+    # Query the Chroma image collection for the images closest to the text query
+    query_results = image_vdb.query(
+        query_texts=[query],
+        n_results=n_results,
+        include=['uris', 'distances']
+    )
+
+    # Render the matching images in the Streamlit app (st.image accepts file paths)
+    for uri in query_results['uris'][0]:
+        st.image(uri, width=300)

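A hedged sketch of trying this helper on its own, assuming the persisted collection created by extract_and_store_images below already exists on disk; the query text is only illustrative:

# Reopen the persisted image collection and query it (illustrative smoke test).
import chromadb
from chromadb.utils.data_loaders import ImageLoader
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction

client = chromadb.PersistentClient(path="image_vdb")
image_vdb = client.get_or_create_collection(
    name="image",
    embedding_function=OpenCLIPEmbeddingFunction(),
    data_loader=ImageLoader(),
)
query_and_print_results(image_vdb, "remote control buttons")
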
src/utils/ingest_image.py ADDED
@@ -0,0 +1,49 @@
+import os
+import fitz  # PyMuPDF
+import chromadb
+from chromadb.utils.data_loaders import ImageLoader
+from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction  # type: ignore
+
+def extract_and_store_images(pdf_path, db_path='image_vdb', images_dir='extracted_images'):
+    # Step 1: Extract images from the PDF with PyMuPDF
+    pdf_document = fitz.open(pdf_path)
+    os.makedirs(images_dir, exist_ok=True)
+
+    for page_num in range(len(pdf_document)):
+        page = pdf_document.load_page(page_num)
+        image_list = page.get_images(full=True)
+
+        for image_index, img in enumerate(image_list):
+            xref = img[0]
+            base_image = pdf_document.extract_image(xref)
+            image_bytes = base_image["image"]
+            image_ext = base_image["ext"]
+            image_filename = f"{images_dir}/page_{page_num+1}_img_{image_index+1}.{image_ext}"
+
+            with open(image_filename, "wb") as image_file:
+                image_file.write(image_bytes)
+            print(f"Saved: {image_filename}")
+
+    print("Image extraction complete.")
+
+    # Step 2: Add the extracted images to a persistent ChromaDB collection
+    chroma_client = chromadb.PersistentClient(path=db_path)
+    image_loader = ImageLoader()
+    CLIP = OpenCLIPEmbeddingFunction()
+    image_vdb = chroma_client.get_or_create_collection(name="image", embedding_function=CLIP, data_loader=image_loader)
+
+    ids = []
+    uris = []
+
+    for i, filename in enumerate(sorted(os.listdir(images_dir))):
+        if filename.lower().endswith(('.jpeg', '.jpg', '.png')):
+            file_path = os.path.join(images_dir, filename)
+            ids.append(str(i))
+            uris.append(file_path)
+
+    image_vdb.add(ids=ids, uris=uris)
+    print("Images added to the database.")
+
+    return image_vdb

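A hedged usage sketch for the function above; the PDF path points at one of the manuals committed under pdf_resource/ and is only illustrative:

# Illustrative usage: extract and index the images of one committed manual.
if __name__ == "__main__":
    vdb = extract_and_store_images("pdf_resource/D3300_NT(En)02.pdf")
    print(vdb.count(), "images indexed")
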
src/utils/ingest_text.py ADDED
@@ -0,0 +1,116 @@
+from llama_parse import LlamaParse
+from langchain_community.vectorstores.qdrant import Qdrant
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+from langchain_community.document_loaders.directory import DirectoryLoader
+import os
+import pickle
+
+import nest_asyncio
+nest_asyncio.apply()
+
+llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+#qdrant_url = os.getenv("QDRANT_URL")
+#qdrant_api_key = os.getenv("QDRANT_API_KEY")
+groq_api_key = os.getenv("GROQ_API_KEY")
+
+# Repo-relative data locations (the originals were absolute local Windows paths)
+data_dir = "data"
+parsed_data_file = os.path.join(data_dir, "parsed_data.pkl")
+output_md = os.path.join(data_dir, "output.md")
+
+# Load cached parse results if available, otherwise parse the PDF with LlamaParse
+def load_or_parse_data(loc):
+    data_file = parsed_data_file
+
+    if os.path.exists(data_file):
+        # Load the parsed data from the cache file
+        with open(data_file, "rb") as f:
+            parsed_data = pickle.load(f)
+    else:
+        # Parse the document and cache the result
+        parsing_instruction = """The provided document is a user guide or a manual.
+        It contains many images and tables.
+        Try to be precise while answering the questions."""
+        parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsing_instruction)  # type: ignore
+        llama_parse_documents = parser.load_data(loc)
+
+        # Save the parsed data to the cache file
+        with open(data_file, "wb") as f:
+            pickle.dump(llama_parse_documents, f)
+
+        parsed_data = llama_parse_documents
+
+    return parsed_data
+
+
+# Create vector database
+def create_vector_database(loc):
+    """
+    Creates a vector database using document loaders and embeddings.
+
+    Parses the uploaded PDF, writes the parsed markdown to disk, splits it into
+    chunks, embeds the chunks with FastEmbedEmbeddings, and persists them into a
+    local Qdrant collection.
+    """
+    # Either load cached parse results or parse the document now
+    llama_parse_documents = load_or_parse_data(loc)
+
+    # Write the parsed markdown to disk; overwrite so reruns do not duplicate content
+    with open(output_md, "w", encoding="utf-8") as f:
+        for doc in llama_parse_documents:
+            f.write(doc.text + "\n")
+
+    loader = DirectoryLoader(data_dir, glob="**/*.md", show_progress=True)
+    documents = loader.load()
+
+    # Split loaded documents into chunks
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
+    docs = text_splitter.split_documents(documents)
+    print(f"Documents chunked: {len(docs)}")
+
+    # Initialize embeddings
+    embeddings = FastEmbedEmbeddings()  # type: ignore
+
+    print("Vector DB started!")
+
+    # Create and persist a local Qdrant vector store from the chunked documents
+    qdrant = Qdrant.from_documents(
+        documents=docs,
+        embedding=embeddings,
+        path="local_qdrant",
+        #url=qdrant_url,
+        #api_key=qdrant_api_key,
+        collection_name="rag",
+    )
+
+    print("Vector DB created successfully!")
+    return qdrant

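A hedged standalone check of the text-ingestion path; the file name comes from this commit, while the query wording is only illustrative:

# Build the Qdrant store for one committed manual and spot-check retrieval.
if __name__ == "__main__":
    qdrant = create_vector_database("pdf_resource/D3300_NT(En)02.pdf")
    for doc in qdrant.similarity_search("How do I attach the lens?", k=3):
        print(doc.page_content[:120])
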
src/utils/text_qa.py ADDED
@@ -0,0 +1,91 @@
+import os
+from langchain_groq import ChatGroq
+from langchain.prompts import PromptTemplate
+#from langchain_community.chat_models import ChatOllama
+from langchain.chains import RetrievalQA
+
+# Bring in our GROQ_API_KEY (and any Qdrant credentials) from .env
+from dotenv import load_dotenv
+load_dotenv()
+
+llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+qdrant_url = os.getenv("QDRANT_URL")
+qdrant_api_key = os.getenv("QDRANT_API_KEY")
+groq_api_key = os.getenv("GROQ_API_KEY")
+
+custom_prompt_template = """Use the following pieces of information to answer the user's question.
+If you don't know the answer, just say that you don't know. If the question is out of context,
+say so, but still try to provide a helpful answer, and don't be rude.
+
+Context: {context}
+Question: {question}
+
+Only return the helpful answer below and nothing else.
+Helpful answer:
+"""
+
+def set_custom_prompt():
+    """
+    Prompt template for QA retrieval for each vectorstore.
+    """
+    prompt = PromptTemplate(template=custom_prompt_template,
+                            input_variables=['context', 'question'])
+    return prompt
+
+
+chat_model = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768", api_key=groq_api_key)  # type: ignore
+#chat_model = ChatGroq(temperature=0, model_name="Llama2-70b-4096")
+#chat_model = ChatOllama(model="llama2", request_timeout=30.0)
+
+
+def retrieval_qa_chain(llm, prompt, vectorstore):
+    # "stuff" chain: the retrieved chunks are inserted directly into the prompt
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=vectorstore.as_retriever(search_kwargs={'k': 2}),
+        return_source_documents=True,
+        chain_type_kwargs={'prompt': prompt}
+    )
+    return qa_chain
+
+
+def qa_bot(qdrant):
+    # The Qdrant vector store built in ingest_text.py acts as the retriever
+    vectorstore = qdrant
+    llm = chat_model
+    qa_prompt = set_custom_prompt()
+    qa = retrieval_qa_chain(llm, qa_prompt, vectorstore)
+    return qa
+
+#---------------------------------------------------------------------#
+# Alternative: point the store at a hosted Qdrant Cloud collection instead
+# of the local on-disk one, e.g.
+#qdrant_cloud = Qdrant.from_documents(
+#    docs,
+#    embeddings,
+#    url=qdrant_url,
+#    prefer_grpc=True,
+#    api_key=qdrant_api_key,
+#    collection_name="qdrant_cloud_documents",
+#)
+#---------------------------------------------------------------------#

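A minimal end-to-end sketch tying the utilities together outside Streamlit, assuming the required API keys are set; the file name comes from this commit and the question is only illustrative:

# Build (or reuse) the text index for one committed manual, then ask a question.
from src.utils.ingest_text import create_vector_database
from src.utils.text_qa import qa_bot

qdrant = create_vector_database("pdf_resource/BN81-25561C-300_EUG_ROPDVBEUD_EU_ENG_240507.0.pdf")
chain = qa_bot(qdrant)
response = chain.invoke("How do I switch between external devices connected to the TV?")
print(response["result"])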