pvyas96 committed on
Commit 7e55c3b
1 Parent(s): 0a1d6e9

Upload 2 files

Files changed (2)
  1. pages/app.py +92 -0
  2. pages/ingest.py +79 -0
pages/app.py ADDED
@@ -0,0 +1,92 @@
+ from langchain.llms import HuggingFaceHub
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
+ from langchain.vectorstores import Chroma
+ from langchain.chains import RetrievalQA
+ import textwrap
+ import torch
+ import os
+ import streamlit as st
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+ def load_vector_store():
+     model_name = "BAAI/bge-small-en"
+     model_kwargs = {"device": device}
+     encode_kwargs = {"normalize_embeddings": True}
+     embeddings = HuggingFaceBgeEmbeddings(
+         model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+     )
+     print('Embeddings loaded!')
+     # use a distinct local name so the function is not shadowed
+     vector_store = Chroma(persist_directory='vector stores/textdb', embedding_function=embeddings)
+     print('Vector store loaded!')
+
+     retriever = vector_store.as_retriever(
+         search_kwargs={"k": 10},
+     )
+     return retriever
+
+
+ # model
+ def load_model():
+     repo_id = 'llmware/dragon-mistral-7b-v0'
+     llm = HuggingFaceHub(
+         repo_id=repo_id,
+         model_kwargs={'max_new_tokens': 100}
+     )
+     print(llm('HI!'))  # smoke-test the endpoint
+     return llm
+
+
+ def qa_chain():
+     retriever = load_vector_store()
+     llm = load_model()
+     qa = RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type='stuff',
+         retriever=retriever,
+         return_source_documents=True,
+         verbose=True
+     )
+     return qa
+
+
+ def wrap_text_preserve_newlines(text, width=110):
+     # Split the input text into lines based on newline characters
+     lines = text.split('\n')
+
+     # Wrap each line individually
+     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
+
+     # Join the wrapped lines back together using newline characters
+     wrapped_text = '\n'.join(wrapped_lines)
+
+     return wrapped_text
+
+
+ def process_llm_response(llm_response):
+     print(wrap_text_preserve_newlines(llm_response['result']))
+     print('\n\nSources:')
+     for source in llm_response["source_documents"]:
+         print(source.metadata['source'])
+
+
+ def main():
+     qa = qa_chain()
+     st.title('DOCUMENT-GPT')
+     text_query = st.text_area('Ask any question from your documents!')
+     generate_response_btn = st.button('Run RAG')
+
+     st.subheader('Response')
+     # st.text_area returns an empty string when blank, so test truthiness rather than None
+     if generate_response_btn and text_query:
+         with st.spinner('Generating Response. Please wait...'):
+             text_response = qa("<human>: " + text_query + "\n" + "<bot>:")
+             if text_response:
+                 st.write(text_response["result"])
+             else:
+                 st.error('Failed to get response')
+
+
+ if __name__ == "__main__":
+     hf_token = st.text_input("Paste Huggingface read api key")
+     if hf_token:
+         os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
+         main()
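
Note that main() rebuilds the retriever and the HuggingFaceHub client on every Streamlit rerun, so each question pays the full setup cost. A minimal sketch of one way around that, caching the chain with Streamlit's st.cache_resource (the wrapper name get_qa_chain is hypothetical, not part of this commit):

    import streamlit as st

    @st.cache_resource  # construct once per process; reruns reuse the cached chain
    def get_qa_chain():
        return qa_chain()  # qa_chain as defined in pages/app.py above

main() would then call get_qa_chain() in place of qa_chain().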
pages/ingest.py ADDED
@@ -0,0 +1,79 @@
+ # importing dependencies
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ from langchain.document_loaders import PyPDFDirectoryLoader
+ import time
+ import torch
+ import streamlit as st
+ import tkinter as tk
+ from tkinter import filedialog
+ from pathlib import Path
+
+
+ def select_folder():
+     root = tk.Tk()
+     root.withdraw()
+     folder_path = filedialog.askdirectory(master=root)
+     root.destroy()
+     return folder_path
+
+
+ # check if CUDA is available and set the device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ print('Using device:', device)
+
+ # loading data
+ root = tk.Tk()
+ root.withdraw()
+
+ # Make folder picker dialog appear on top of other windows
+ root.wm_attributes('-topmost', 1)
+
+ # Folder picker button
+ st.title('Pick Pdfs Folder')
+ st.write('Please select a folder:')
+
+ dirname = ""
+ pdfs_folder = ""
+ clicked = st.button('Browse')
+ if clicked:
+     dirname = st.text_input('Selected folder:', filedialog.askdirectory(master=root))
+     pdfs_folder = Path(dirname)
+
+ if pdfs_folder:
+     st.write("Selected folder path:", pdfs_folder)
+     loader = PyPDFDirectoryLoader(pdfs_folder)
+     documents = loader.load()
+     st.write(len(documents))
+
+     # splitting
+     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
+     text_chunks = splitter.split_documents(documents)
+     st.write(len(text_chunks))
+
+     # loading HuggingFaceBGE embeddings
+     model_name = "BAAI/bge-small-en"
+     st.write("Loading embedding model", model_name)
+     model_kwargs = {"device": device}
+     encode_kwargs = {"normalize_embeddings": True}
+     embeddings = HuggingFaceBgeEmbeddings(
+         model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+     )
+
+     st.write('Embeddings loaded!')
+
+     # creating Documents vector database
+     t1 = time.time()
+     persist_directory = 'dbname'
+     vectordb = Chroma.from_documents(
+         documents=text_chunks,
+         embedding=embeddings,
+         collection_metadata={"hnsw:space": "cosine"},
+         persist_directory=persist_directory
+     )
+     t2 = time.time()
+     st.write('Time taken for building db:', (t2 - t1))
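
Note that ingest.py persists the database to 'dbname' while app.py loads from 'vector stores/textdb'; the two paths must agree for the app to see the ingested chunks. A quick sanity check of the persisted store after ingestion might look like this (a sketch reusing the same embedding settings as above; the query string is only an example):

    from langchain.embeddings import HuggingFaceBgeEmbeddings
    from langchain.vectorstores import Chroma

    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        encode_kwargs={"normalize_embeddings": True},
    )
    db = Chroma(persist_directory='dbname', embedding_function=embeddings)
    for doc in db.similarity_search("example query", k=3):  # hypothetical query
        print(doc.metadata.get('source'), doc.page_content[:80])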