Spaces:

fazni
/

Resume-filter-plus-QA-documents

Running

App Files Files Community

fazni committed on Nov 16, 2023

Commit

5ad9f7c

1 Parent(s): 836b34d

added google palm model

Browse files

Files changed (14) hide show

FindKeyword.py → Components/FindKeyword.py +9 -0
Components/GooglePalmChat.py +40 -0
Components/OpenAIChat.py +20 -0
PreprocessText.py → Components/PreprocessText.py +39 -1
Components/Vector_db.py +23 -0
Components/__pycache__/FindKeyword.cpython-310.pyc +0 -0
Components/__pycache__/GooglePalmChat.cpython-310.pyc +0 -0
Components/__pycache__/PreprocessText.cpython-310.pyc +0 -0
Components/__pycache__/Vector_db.cpython-310.pyc +0 -0
Components/__pycache__/model_Responce.cpython-310.pyc +0 -0
model_Responce.py → Components/model_Responce.py +1 -2
app.py +30 -110
packages.txt +1 -0
requirements.txt +3 -1

FindKeyword.py → Components/FindKeyword.py RENAMED Viewed

@@ -9,3 +9,12 @@ def FindKeyWords(keywords, text):
             return "Keyword not found in the Resume."
     return highlighted_text

             return "Keyword not found in the Resume."
     return highlighted_text
+def filter_keywords(all_text, keywords):
+    filtered_text = []
+    for item in all_text:
+        filename = item['filename']
+        text = item['text']
+        filtered_text_with_keywords = FindKeyWords(keywords, text)
+        filtered_text.append({"filename": filename, "text": filtered_text_with_keywords})
+    return filtered_text

Components/GooglePalmChat.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import os
+from dotenv import load_dotenv
+from langchain.llms import GooglePalm
+from langchain.chains import RetrievalQA
+from langchain.vectorstores import FAISS
+from langchain.prompts import PromptTemplate
+load_dotenv()  # take environment variables from .env (GOOGLE_PALM_API is read below)
+# Create Google Palm LLM model
+llm = GooglePalm(google_api_key=os.environ["GOOGLE_PALM_API"], temperature=0.1)
+# Folder passed to FAISS.load_local in get_qa_chain
+vectordb_file_path = "faiss_index_V2"
+def get_qa_chain(embeddings):
+    """Build a RetrievalQA chain over the saved FAISS index using the module-level ``llm``.
+
+    ``embeddings`` is handed to ``FAISS.load_local`` to load the index stored at
+    ``vectordb_file_path``. The returned chain reads its input from the
+    ``"query"`` key and is configured to return source documents.
+    """
+    # Load the vector database from the local folder
+    vectordb = FAISS.load_local(vectordb_file_path, embeddings)
+    # Create a retriever for querying the vector database
+    # NOTE(review): LangChain documents score thresholds via
+    # search_type="similarity_score_threshold" plus search_kwargs={"score_threshold": ...};
+    # confirm that this bare keyword argument actually takes effect.
+    retriever = vectordb.as_retriever(score_threshold=0.7)
+    prompt_template = """Given the following context and a question, generate an answer based on this context only.
+    In the answer try to provide as much text as possible from the source document context without making much changes.
+    If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.
+    CONTEXT: {context}
+    QUESTION: {question}"""
+    PROMPT = PromptTemplate(
+        template=prompt_template, input_variables=["context", "question"]
+    )
+    # "stuff" chain type with the custom prompt supplied through chain_type_kwargs
+    chain = RetrievalQA.from_chain_type(llm=llm,
+                                        chain_type="stuff",
+                                        retriever=retriever,
+                                        input_key="query",
+                                        return_source_documents=True,
+                                        chain_type_kwargs={"prompt": PROMPT})
+    return chain

Components/OpenAIChat.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# def handle_user_input(question):
+#     response = st.session_state.conversation({'question':question})
+#     st.session_state.chat_history = response('chat_history')
+#     for i,message in enumerate(st.session_state.chat_history):
+#         if i % 2 == 0:
+#             st.write(user_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
+#         else:
+#             st.write(bot_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
+# def get_conversation_chain(vector_store):
+#     llm = ChatOpenAI()
+#     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+#     conversation_chain = ConversationalRetrievalChain.from_llm(
+#         llm=llm,
+#         retriever=vector_store.as_retriever(),
+#         memory = memory
+#     )
+#     return conversation_chain

PreprocessText.py → Components/PreprocessText.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import re
 def preprocess_text(text):
     # Remove newlines and tabs
@@ -25,4 +26,41 @@ def preprocess_text(text):
     # Add a space before a word containing a capital letter in the middle
     text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
-    return text

 import re
+from PyPDF2 import PdfReader
 def preprocess_text(text):
     # Remove newlines and tabs
     # Add a space before a word containing a capital letter in the middle
     text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
+    return text
+def get_pdf_text(pdfs,preprocess=True):
+    if preprocess:
+        all_text = []
+        for pdf in pdfs:
+            # Process each uploaded PDF file
+            # Reading PDF
+            pdf_reader = PdfReader(pdf)
+            # Get the filename of the PDF
+            filename = pdf.name
+            text = ""
+            # Reading Each Page
+            for page in pdf_reader.pages:
+                # Extracting Text in Every Page
+                text += page.extract_text()
+            # Preprocess the text
+            text = preprocess_text(text)
+            # Appending to array
+            all_text.append({"filename": filename, "text": text})
+        return all_text
+    else:
+        text = ""
+        for pdf in pdfs:
+            # Process each uploaded PDF file
+            # Reading PDF
+            pdf_reader = PdfReader(pdf)
+            # Reading Each Page
+            for page in pdf_reader.pages:
+                # Extracting Text in Every Page
+                text += page.extract_text()
+        # text = preprocess_text(text)
+        return text

Components/Vector_db.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import streamlit as st
+from langchain.vectorstores import FAISS
+# from langchain.chat_models import ChatOpenAI
+# from langchain.memory import ConversationBufferMemory
+# from langchain.chains import ConversationalRetrievalChain
+# Assuming this function encodes the question into a vector representation
+def encode_question(question,embeddings):
+    """Return the vector produced by ``embeddings.embed_query(question)``."""
+    # embeddings = HuggingFaceInstructEmbeddings()  # Instantiate the embeddings model
+    question_vector = embeddings.embed_query(question)  # Encode the question into a vector
+    return question_vector
+def save_vector_store(text_chunks,embeddings):
+    # embeddings = OpenAIEmbeddings()
+    # model = INSTRUCTOR('hkunlp/instructor-base')
+    # embeddings = model.encode(raw_text)
+    # embeddings = HuggingFaceInstructEmbeddings()
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    new_db = FAISS.load_local("faiss_index_V2", embeddings)
+    new_db.merge_from(vectorstore)
+    new_db.save_local('faiss_index_V2')
+    return st.write("vector Store is Saved")

Components/__pycache__/FindKeyword.cpython-310.pyc ADDED Viewed

Binary file (851 Bytes). View file

Components/__pycache__/GooglePalmChat.cpython-310.pyc ADDED Viewed

Binary file (1.45 kB). View file

Components/__pycache__/PreprocessText.cpython-310.pyc ADDED Viewed

Binary file (1.03 kB). View file

Components/__pycache__/Vector_db.cpython-310.pyc ADDED Viewed

Binary file (735 Bytes). View file

Components/__pycache__/model_Responce.cpython-310.pyc ADDED Viewed

Binary file (1.21 kB). View file

model_Responce.py → Components/model_Responce.py RENAMED Viewed

@@ -7,14 +7,13 @@ from keras.preprocessing.text import Tokenizer
 # Load the model from the pickle file
 # filename = 'F:/CVFilter/models/model_pk.pkl'
 # with open(filename, 'rb') as file:
 #     model = pickle.load(file)
 # Load the saved model
 # model = joblib.load('F:\CVFilter\models\model.joblib')
-model = tf.keras.models.load_model('models/model.h5')
 tokenfile = 'tokenized_words/tokenized_words.pkl'
 # Load the tokenized words from the pickle file

 # Load the model from the pickle file
 # filename = 'F:/CVFilter/models/model_pk.pkl'
 # with open(filename, 'rb') as file:
 #     model = pickle.load(file)
 # Load the saved model
 # model = joblib.load('F:\CVFilter\models\model.joblib')
+model = tf.keras.models.load_model('models\model.h5')
 tokenfile = 'tokenized_words/tokenized_words.pkl'
 # Load the tokenized words from the pickle file

app.py CHANGED Viewed

@@ -1,17 +1,14 @@
-import re
 import streamlit as st
-from PyPDF2 import PdfReader
 from dotenv import load_dotenv
-from FindKeyword import FindKeyWords
-from PreprocessText import preprocess_text
-from model_Responce import model_prediction
 from streamlit_extras.add_vertical_space import add_vertical_space
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
-# from langchain.chat_models import ChatOpenAI
-# from langchain.memory import ConversationBufferMemory
-# from langchain.chains import ConversationalRetrievalChain
 from htmlTemplates import css, bot_template, user_template
 from InstructorEmbedding import INSTRUCTOR
 import numpy as np
@@ -27,44 +24,6 @@ def get_text_chunks(text):
     chunks = text_splitter.split_text(text)
     return chunks
-# Assuming this function encodes the question into a vector representation
-def encode_question(question):
-    embeddings = HuggingFaceInstructEmbeddings()  # Instantiate the embeddings model
-    question_vector = embeddings.embed_query(question)  # Encode the question into a vector
-    return question_vector
-# def handle_user_input(question):
-#     response = st.session_state.conversation({'question':question})
-#     st.session_state.chat_history = response('chat_history')
-#     for i,message in enumerate(st.session_state.chat_history):
-#         if i % 2 == 0:
-#             st.write(user_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
-#         else:
-#             st.write(bot_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
-# def get_conversation_chain(vector_store):
-#     llm = ChatOpenAI()
-#     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-#     conversation_chain = ConversationalRetrievalChain.from_llm(
-#         llm=llm,
-#         retriever=vector_store.as_retriever(),
-#         memory = memory
-#     )
-#     return conversation_chain
-def save_vector_store(text_chunks):
-    # embeddings = OpenAIEmbeddings()
-    # model = INSTRUCTOR('hkunlp/instructor-base')
-    # embeddings = model.encode(raw_text)
-    embeddings = HuggingFaceInstructEmbeddings()
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-    new_db = FAISS.load_local("faiss_index_V2", embeddings)
-    new_db.merge_from(vectorstore)
-    new_db.save_local('faiss_index_V2')
-    return st.write("vector Store is Saved")
 def button_function(all_text):
     # Add your desired functionality here
     # predictions = []
@@ -76,53 +35,6 @@ def button_function(all_text):
         item['prediction'] = pred
     return all_text
-def get_pdf_text(pdfs,preprocess=True):
-    if preprocess:
-        all_text = []
-        for pdf in pdfs:
-            # Process each uploaded PDF file
-            # Reading PDF
-            pdf_reader = PdfReader(pdf)
-            # Get the filename of the PDF
-            filename = pdf.name
-            text = ""
-            # Reading Each Page
-            for page in pdf_reader.pages:
-                # Extracting Text in Every Page
-                text += page.extract_text()
-            # Preprocess the text
-            text = preprocess_text(text)
-            # Appending to array
-            all_text.append({"filename": filename, "text": text})
-        return all_text
-    else:
-        text = ""
-        for pdf in pdfs:
-            # Process each uploaded PDF file
-            # Reading PDF
-            pdf_reader = PdfReader(pdf)
-            # Reading Each Page
-            for page in pdf_reader.pages:
-                # Extracting Text in Every Page
-                text += page.extract_text()
-        # text = preprocess_text(text)
-        return text
-def filter_keywords(all_text, keywords):
-    filtered_text = []
-    for item in all_text:
-        filename = item['filename']
-        text = item['text']
-        filtered_text_with_keywords = FindKeyWords(keywords, text)
-        filtered_text.append({"filename": filename, "text": filtered_text_with_keywords})
-    return filtered_text
 # Main body
 def main():
     # vector_store = None
@@ -140,17 +52,8 @@ def main():
         # Choose functionality: Prediction or Filtering
         functionality = st.radio("Choose functionality:", ("Make Predictions", "Filter Keywords","Predict the Suitable canditate","Ask Questions"))
-        if functionality == "Ask Questions":
-            if st.button('Process'):
-                with st.spinner("Processing"):
-                    # get pdf text
-                    raw_text = get_pdf_text(pdfs, preprocess=False)
-                    # get the text chunk
-                    text_chunks = get_text_chunks(raw_text)
-                    # create vector store
-                    save_vector_store(text_chunks)
         add_vertical_space(5)
         st.write('Made with ❤️ by Fazni Farook')
@@ -219,25 +122,42 @@ def main():
             embeddings = HuggingFaceInstructEmbeddings()
-            new_db = FAISS.load_local("faiss_index_V2", embeddings)
             st.write(css,unsafe_allow_html=True)
             # create conversation chain
             # st.session_state.conversation = get_conversation_chain(vector_store)
-            question = st.text_input("Ask Question")
             if st.button('Ask Question'):
                 with st.spinner("Processing"):
                     if question:
                         # Convert the question to a vector
-                        question_vector = encode_question(question)
                         # Convert the vector store to a compatible format
-                        output = new_db.similarity_search_by_vector(question_vector)
-                        page_content = output[0].page_content
-                        st.write(page_content)
 if __name__=='__main__':
     main()

 import streamlit as st
 from dotenv import load_dotenv
+from Components.FindKeyword import filter_keywords
+from Components.PreprocessText import get_pdf_text
+from Components.model_Responce import model_prediction
+from Components.GooglePalmChat import get_qa_chain
+from Components.Vector_db import encode_question, save_vector_store
 from streamlit_extras.add_vertical_space import add_vertical_space
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 from htmlTemplates import css, bot_template, user_template
 from InstructorEmbedding import INSTRUCTOR
 import numpy as np
     chunks = text_splitter.split_text(text)
     return chunks
 def button_function(all_text):
     # Add your desired functionality here
     # predictions = []
         item['prediction'] = pred
     return all_text
 # Main body
 def main():
     # vector_store = None
         # Choose functionality: Prediction or Filtering
         functionality = st.radio("Choose functionality:", ("Make Predictions", "Filter Keywords","Predict the Suitable canditate","Ask Questions"))
+        # if functionality == "Ask Questions":
         add_vertical_space(5)
         st.write('Made with ❤️ by Fazni Farook')
             embeddings = HuggingFaceInstructEmbeddings()
+            # new_db = FAISS.load_local("faiss_index_V2", embeddings)
+            if st.button('Create Knowledgebase'):
+                with st.spinner("Processing"):
+                    # embeddings = HuggingFaceInstructEmbeddings()
+                    # get pdf text
+                    raw_text = get_pdf_text(pdfs, preprocess=False)
+                    # get the text chunk
+                    text_chunks = get_text_chunks(raw_text)
+                    # create vector store
+                    save_vector_store(text_chunks,embeddings)
             st.write(css,unsafe_allow_html=True)
             # create conversation chain
             # st.session_state.conversation = get_conversation_chain(vector_store)
+            question = st.text_input("Ask Question: ")
             if st.button('Ask Question'):
                 with st.spinner("Processing"):
                     if question:
                         # Convert the question to a vector
+                        # question_vector = encode_question(question,embeddings)
                         # Convert the vector store to a compatible format
+                        # output = new_db.similarity_search_by_vector(question_vector)
+                        # page_content = output[0].page_content
+                        # Asking Questions using Google Palm
+                        chain = get_qa_chain(embeddings)
+                        response = chain(question)
+                        st.header("Answer: ")
+                        st.write(response["result"])
 if __name__=='__main__':
     main()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ libgl1

requirements.txt CHANGED Viewed

@@ -14,4 +14,6 @@ openai
 huggingface_hub
 InstructorEmbedding
 torch
-sentence_transformers

 huggingface_hub
 InstructorEmbedding
 torch
+sentence_transformers
+google-generativeai
+protobuf~=3.19.0