fazni committed on
Commit
5ad9f7c
•
1 Parent(s): 836b34d

added google palm model

Browse files
FindKeyword.py → Components/FindKeyword.py RENAMED
@@ -9,3 +9,12 @@ def FindKeyWords(keywords, text):
9
  return "Keyword not found in the Resume."
10
 
11
  return highlighted_text
 
 
 
 
 
 
 
 
 
 
9
  return "Keyword not found in the Resume."
10
 
11
  return highlighted_text
12
+
13
def filter_keywords(all_text, keywords):
    """Run the keyword search over every extracted document.

    Args:
        all_text: Iterable of dicts, each providing a ``'filename'`` and a
            ``'text'`` entry.
        keywords: Keywords forwarded unchanged to ``FindKeyWords``.

    Returns:
        A new list holding one ``{"filename": ..., "text": ...}`` dict per
        input item, where ``"text"`` is whatever ``FindKeyWords`` returned
        for that document's text.
    """
    return [
        {"filename": doc['filename'], "text": FindKeyWords(keywords, doc['text'])}
        for doc in all_text
    ]
Components/GooglePalmChat.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain.llms import GooglePalm
4
+ from langchain.chains import RetrievalQA
5
+ from langchain.vectorstores import FAISS
6
+ from langchain.prompts import PromptTemplate
7
+
8
+ load_dotenv() # take environment variables from .env (especially the GOOGLE_PALM_API key read below)
9
+
10
+ # Create Google Palm LLM model (os.environ[...] raises KeyError at import time if GOOGLE_PALM_API is unset)
11
+ llm = GooglePalm(google_api_key=os.environ["GOOGLE_PALM_API"], temperature=0.1)
12
+ vectordb_file_path = "faiss_index_V2" # folder passed to FAISS.load_local in get_qa_chain
13
+
14
def get_qa_chain(embeddings):
    """Build a RetrievalQA chain over the locally saved FAISS index.

    Args:
        embeddings: Embeddings object handed to ``FAISS.load_local``; the
            index is read from the folder named by ``vectordb_file_path``.

    Returns:
        A ``RetrievalQA`` "stuff" chain that takes the question under the
        ``"query"`` key and also returns the retrieved source documents.
    """
    # Load the vector database from the local folder
    vectordb = FAISS.load_local(vectordb_file_path, embeddings)

    # The threshold has to travel through search_kwargs together with the
    # "similarity_score_threshold" search type. Passing score_threshold as a
    # bare keyword to as_retriever() does not configure the search, so the
    # 0.7 cut-off was never applied.
    retriever = vectordb.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.7},
    )

    prompt_template = """Given the following context and a question, generate an answer based on this context only.
    In the answer try to provide as much text as possible from the source document context without making much changes.
    If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.

    CONTEXT: {context}

    QUESTION: {question}"""

    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        input_key="query",
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
Components/OpenAIChat.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # def handle_user_input(question):
3
+ # response = st.session_state.conversation({'question':question})
4
+ # st.session_state.chat_history = response('chat_history')
5
+
6
+ # for i,message in enumerate(st.session_state.chat_history):
7
+ # if i % 2 == 0:
8
+ # st.write(user_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
9
+ # else:
10
+ # st.write(bot_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
11
+
12
+ # def get_conversation_chain(vector_store):
13
+ # llm = ChatOpenAI()
14
+ # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
15
+ # conversation_chain = ConversationalRetrievalChain.from_llm(
16
+ # llm=llm,
17
+ # retriever=vector_store.as_retriever(),
18
+ # memory = memory
19
+ # )
20
+ # return conversation_chain
PreprocessText.py → Components/PreprocessText.py RENAMED
@@ -1,4 +1,5 @@
1
  import re
 
2
 
3
  def preprocess_text(text):
4
  # Remove newlines and tabs
@@ -25,4 +26,41 @@ def preprocess_text(text):
25
  # Add a space before a word containing a capital letter in the middle
26
  text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
27
 
28
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
+ from PyPDF2 import PdfReader
3
 
4
  def preprocess_text(text):
5
  # Remove newlines and tabs
 
26
  # Add a space before a word containing a capital letter in the middle
27
  text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
28
 
29
+ return text
30
+
31
def _read_pdf_text(pdf):
    """Return the concatenated text of every page of one PDF file."""
    # Defensive: treat a falsy extract_text() result as an empty string so a
    # page without extractable text cannot break the join.
    return "".join(page.extract_text() or "" for page in PdfReader(pdf).pages)


def get_pdf_text(pdfs, preprocess=True):
    """Extract text from the given PDF files.

    Args:
        pdfs: Iterable of file objects accepted by ``PdfReader``. When
            ``preprocess`` is true each one must also expose ``.name``.
        preprocess: Selects the output shape (see Returns).

    Returns:
        With ``preprocess=True``: a list with one
        ``{"filename": pdf.name, "text": ...}`` dict per PDF, where the text
        has been passed through ``preprocess_text``.
        With ``preprocess=False``: a single string holding the raw text of
        all PDFs concatenated in order.
    """
    if not preprocess:
        return "".join(_read_pdf_text(pdf) for pdf in pdfs)

    all_text = []
    for pdf in pdfs:
        raw_text = _read_pdf_text(pdf)
        all_text.append({"filename": pdf.name, "text": preprocess_text(raw_text)})
    return all_text
Components/Vector_db.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.vectorstores import FAISS
3
+ # from langchain.chat_models import ChatOpenAI
4
+ # from langchain.memory import ConversationBufferMemory
5
+ # from langchain.chains import ConversationalRetrievalChain
6
+
7
+ # Assuming this function encodes the question into a vector representation
8
def encode_question(question, embeddings):
    """Embed a question with the supplied embeddings model.

    Args:
        question: The question text to encode.
        embeddings: Object exposing ``embed_query``; the caller picks the model.

    Returns:
        Whatever ``embeddings.embed_query(question)`` returns.
    """
    return embeddings.embed_query(question)
12
+
13
def save_vector_store(text_chunks, embeddings, index_path="faiss_index_V2"):
    """Embed text chunks and persist them in the local FAISS index.

    If an index has already been saved under ``index_path`` the new vectors
    are merged into it; otherwise the freshly built index is saved as-is, so
    the very first run no longer fails trying to load a missing index.

    Args:
        text_chunks: List of text strings to embed.
        embeddings: Embeddings object used both to build the new vectors and
            to load the existing index.
        index_path: Folder holding the persisted index. Defaults to the
            folder name this function previously hard-coded.

    Returns:
        The result of the ``st.write`` call that reports the outcome.
    """
    import os  # local import: this module does not import os at file level

    if not text_chunks:
        # FAISS.from_texts cannot build an index from an empty list.
        return st.write("No text found to add to the vector store")

    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    # save_local writes "index.faiss" into the folder; its presence tells us
    # whether there is an existing index to merge into.
    if os.path.exists(os.path.join(index_path, "index.faiss")):
        merged_db = FAISS.load_local(index_path, embeddings)
        merged_db.merge_from(vectorstore)
    else:
        merged_db = vectorstore
    merged_db.save_local(index_path)

    return st.write("vector Store is Saved")
Components/__pycache__/FindKeyword.cpython-310.pyc ADDED
Binary file (851 Bytes). View file
 
Components/__pycache__/GooglePalmChat.cpython-310.pyc ADDED
Binary file (1.45 kB). View file
 
Components/__pycache__/PreprocessText.cpython-310.pyc ADDED
Binary file (1.03 kB). View file
 
Components/__pycache__/Vector_db.cpython-310.pyc ADDED
Binary file (735 Bytes). View file
 
Components/__pycache__/model_Responce.cpython-310.pyc ADDED
Binary file (1.21 kB). View file
 
model_Responce.py → Components/model_Responce.py RENAMED
@@ -7,14 +7,13 @@ from keras.preprocessing.text import Tokenizer
7
 
8
  # Load the model from the pickle file
9
  # filename = 'F:/CVFilter/models/model_pk.pkl'
10
-
11
  # with open(filename, 'rb') as file:
12
  # model = pickle.load(file)
13
 
14
  # Load the saved model
15
  # model = joblib.load('F:\CVFilter\models\model.joblib')
16
 
17
- model = tf.keras.models.load_model('models/model.h5')
18
 
19
  tokenfile = 'tokenized_words/tokenized_words.pkl'
20
  # Load the tokenized words from the pickle file
 
7
 
8
  # Load the model from the pickle file
9
  # filename = 'F:/CVFilter/models/model_pk.pkl'
 
10
  # with open(filename, 'rb') as file:
11
  # model = pickle.load(file)
12
 
13
  # Load the saved model
14
  # model = joblib.load('F:\CVFilter\models\model.joblib')
15
 
16
# Use a forward slash: 'models\model.h5' contains the invalid escape '\m' and,
# on POSIX systems, names a single file with a backslash in it rather than
# model.h5 inside the models/ folder. A forward slash works on every OS.
model = tf.keras.models.load_model('models/model.h5')
17
 
18
  tokenfile = 'tokenized_words/tokenized_words.pkl'
19
  # Load the tokenized words from the pickle file
app.py CHANGED
@@ -1,17 +1,14 @@
1
- import re
2
  import streamlit as st
3
- from PyPDF2 import PdfReader
4
  from dotenv import load_dotenv
5
- from FindKeyword import FindKeyWords
6
- from PreprocessText import preprocess_text
7
- from model_Responce import model_prediction
 
 
8
  from streamlit_extras.add_vertical_space import add_vertical_space
9
  from langchain.text_splitter import CharacterTextSplitter
10
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
11
  from langchain.vectorstores import FAISS
12
- # from langchain.chat_models import ChatOpenAI
13
- # from langchain.memory import ConversationBufferMemory
14
- # from langchain.chains import ConversationalRetrievalChain
15
  from htmlTemplates import css, bot_template, user_template
16
  from InstructorEmbedding import INSTRUCTOR
17
  import numpy as np
@@ -27,44 +24,6 @@ def get_text_chunks(text):
27
  chunks = text_splitter.split_text(text)
28
  return chunks
29
 
30
- # Assuming this function encodes the question into a vector representation
31
- def encode_question(question):
32
- embeddings = HuggingFaceInstructEmbeddings() # Instantiate the embeddings model
33
- question_vector = embeddings.embed_query(question) # Encode the question into a vector
34
- return question_vector
35
-
36
- # def handle_user_input(question):
37
- # response = st.session_state.conversation({'question':question})
38
- # st.session_state.chat_history = response('chat_history')
39
-
40
- # for i,message in enumerate(st.session_state.chat_history):
41
- # if i % 2 == 0:
42
- # st.write(user_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
43
- # else:
44
- # st.write(bot_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
45
-
46
- # def get_conversation_chain(vector_store):
47
- # llm = ChatOpenAI()
48
- # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
49
- # conversation_chain = ConversationalRetrievalChain.from_llm(
50
- # llm=llm,
51
- # retriever=vector_store.as_retriever(),
52
- # memory = memory
53
- # )
54
- # return conversation_chain
55
-
56
- def save_vector_store(text_chunks):
57
- # embeddings = OpenAIEmbeddings()
58
- # model = INSTRUCTOR('hkunlp/instructor-base')
59
- # embeddings = model.encode(raw_text)
60
- embeddings = HuggingFaceInstructEmbeddings()
61
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
62
- new_db = FAISS.load_local("faiss_index_V2", embeddings)
63
- new_db.merge_from(vectorstore)
64
- new_db.save_local('faiss_index_V2')
65
-
66
- return st.write("vector Store is Saved")
67
-
68
  def button_function(all_text):
69
  # Add your desired functionality here
70
  # predictions = []
@@ -76,53 +35,6 @@ def button_function(all_text):
76
  item['prediction'] = pred
77
  return all_text
78
 
79
- def get_pdf_text(pdfs,preprocess=True):
80
- if preprocess:
81
- all_text = []
82
- for pdf in pdfs:
83
- # Process each uploaded PDF file
84
- # Reading PDF
85
- pdf_reader = PdfReader(pdf)
86
-
87
- # Get the filename of the PDF
88
- filename = pdf.name
89
-
90
- text = ""
91
- # Reading Each Page
92
- for page in pdf_reader.pages:
93
- # Extracting Text in Every Page
94
- text += page.extract_text()
95
- # Preprocess the text
96
- text = preprocess_text(text)
97
- # Appending to array
98
- all_text.append({"filename": filename, "text": text})
99
- return all_text
100
-
101
- else:
102
- text = ""
103
- for pdf in pdfs:
104
- # Process each uploaded PDF file
105
- # Reading PDF
106
- pdf_reader = PdfReader(pdf)
107
-
108
- # Reading Each Page
109
- for page in pdf_reader.pages:
110
- # Extracting Text in Every Page
111
- text += page.extract_text()
112
-
113
- # text = preprocess_text(text)
114
- return text
115
-
116
- def filter_keywords(all_text, keywords):
117
- filtered_text = []
118
- for item in all_text:
119
- filename = item['filename']
120
- text = item['text']
121
- filtered_text_with_keywords = FindKeyWords(keywords, text)
122
- filtered_text.append({"filename": filename, "text": filtered_text_with_keywords})
123
- return filtered_text
124
-
125
-
126
  # Main body
127
  def main():
128
  # vector_store = None
@@ -140,17 +52,8 @@ def main():
140
 
141
  # Choose functionality: Prediction or Filtering
142
  functionality = st.radio("Choose functionality:", ("Make Predictions", "Filter Keywords","Predict the Suitable canditate","Ask Questions"))
143
- if functionality == "Ask Questions":
144
- if st.button('Process'):
145
- with st.spinner("Processing"):
146
- # get pdf text
147
- raw_text = get_pdf_text(pdfs, preprocess=False)
148
-
149
- # get the text chunk
150
- text_chunks = get_text_chunks(raw_text)
151
-
152
- # create vector store
153
- save_vector_store(text_chunks)
154
  add_vertical_space(5)
155
  st.write('Made with ❀️ by Fazni Farook')
156
 
@@ -219,25 +122,42 @@ def main():
219
 
220
  embeddings = HuggingFaceInstructEmbeddings()
221
 
222
- new_db = FAISS.load_local("faiss_index_V2", embeddings)
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  st.write(css,unsafe_allow_html=True)
225
 
226
  # create conversation chain
227
  # st.session_state.conversation = get_conversation_chain(vector_store)
228
 
229
- question = st.text_input("Ask Question")
230
 
231
  if st.button('Ask Question'):
232
  with st.spinner("Processing"):
233
  if question:
234
  # Convert the question to a vector
235
- question_vector = encode_question(question)
236
 
237
  # Convert the vector store to a compatible format
238
- output = new_db.similarity_search_by_vector(question_vector)
239
- page_content = output[0].page_content
240
- st.write(page_content)
 
 
 
 
 
241
 
242
  if __name__=='__main__':
243
  main()
 
 
1
  import streamlit as st
 
2
  from dotenv import load_dotenv
3
+ from Components.FindKeyword import filter_keywords
4
+ from Components.PreprocessText import get_pdf_text
5
+ from Components.model_Responce import model_prediction
6
+ from Components.GooglePalmChat import get_qa_chain
7
+ from Components.Vector_db import encode_question, save_vector_store
8
  from streamlit_extras.add_vertical_space import add_vertical_space
9
  from langchain.text_splitter import CharacterTextSplitter
10
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
11
  from langchain.vectorstores import FAISS
 
 
 
12
  from htmlTemplates import css, bot_template, user_template
13
  from InstructorEmbedding import INSTRUCTOR
14
  import numpy as np
 
24
  chunks = text_splitter.split_text(text)
25
  return chunks
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def button_function(all_text):
28
  # Add your desired functionality here
29
  # predictions = []
 
35
  item['prediction'] = pred
36
  return all_text
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # Main body
39
  def main():
40
  # vector_store = None
 
52
 
53
  # Choose functionality: Prediction or Filtering
54
  functionality = st.radio("Choose functionality:", ("Make Predictions", "Filter Keywords","Predict the Suitable canditate","Ask Questions"))
55
+ # if functionality == "Ask Questions":
56
+
 
 
 
 
 
 
 
 
 
57
  add_vertical_space(5)
58
  st.write('Made with ❀️ by Fazni Farook')
59
 
 
122
 
123
  embeddings = HuggingFaceInstructEmbeddings()
124
 
125
+ # new_db = FAISS.load_local("faiss_index_V2", embeddings)
126
+
127
+ if st.button('Create Knowledgebase'):
128
+ with st.spinner("Processing"):
129
+ # embeddings = HuggingFaceInstructEmbeddings()
130
+ # get pdf text
131
+ raw_text = get_pdf_text(pdfs, preprocess=False)
132
+
133
+ # get the text chunk
134
+ text_chunks = get_text_chunks(raw_text)
135
+
136
+ # create vector store
137
+ save_vector_store(text_chunks,embeddings)
138
 
139
  st.write(css,unsafe_allow_html=True)
140
 
141
  # create conversation chain
142
  # st.session_state.conversation = get_conversation_chain(vector_store)
143
 
144
+ question = st.text_input("Ask Question: ")
145
 
146
  if st.button('Ask Question'):
147
  with st.spinner("Processing"):
148
  if question:
149
  # Convert the question to a vector
150
+ # question_vector = encode_question(question,embeddings)
151
 
152
  # Convert the vector store to a compatible format
153
+ # output = new_db.similarity_search_by_vector(question_vector)
154
+ # page_content = output[0].page_content
155
+
156
+ # Asking Questions using Google Palm
157
+ chain = get_qa_chain(embeddings)
158
+ response = chain(question)
159
+ st.header("Answer: ")
160
+ st.write(response["result"])
161
 
162
  if __name__=='__main__':
163
  main()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ libgl1
requirements.txt CHANGED
@@ -14,4 +14,6 @@ openai
14
  huggingface_hub
15
  InstructorEmbedding
16
  torch
17
- sentence_transformers
 
 
 
14
  huggingface_hub
15
  InstructorEmbedding
16
  torch
17
+ sentence_transformers
18
+ google-generativeai
19
+ protobuf~=3.19.0