palitrajarshi committed on
Commit
e9f8bde
·
1 Parent(s): 319051d

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +72 -0
  2. constants (1).py +3 -0
  3. env-example.txt +2 -0
  4. requirements.txt +12 -0
  5. utils.py +110 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import constants
3
+ from utils import *
4
+ import uuid
5
+
6
+ #Creating session variables
7
# Seed the per-session identifier with an empty placeholder on first run;
# later reruns of the script keep whatever value is already stored.
if 'unique_id' not in st.session_state:
    st.session_state['unique_id'] = ''
9
+
10
def main():
    """Render the resume-screening page and run the analysis on submit.

    The page collects a job description, the number of resumes to return
    and a set of uploaded PDF resumes. When the button is pressed, the
    resumes are converted to documents, ranked against the job description
    with ``close_matches`` and each match is shown with a score and an
    LLM-generated summary.

    Input is validated before any work starts: at least one PDF must be
    uploaded and the requested count must be a positive whole number.
    Otherwise a warning is shown and nothing is processed.
    """
    st.set_page_config(page_title="Resume Screening Assistance")
    st.title("HR - Resume Screening Assistance...💁 ")
    st.subheader("I can help you in resume screening process")

    job_description = st.text_area("Please paste the 'JOB DESCRIPTION' here...",key="1")
    document_count = st.text_input("No.of 'RESUMES' to return",key="2")
    # Upload the resumes (PDF files only)
    pdf = st.file_uploader("Upload resumes here, only PDF files allowed", type=["pdf"],accept_multiple_files=True)

    submit = st.button("Help me with the analysis")
    if not submit:
        return

    # Without any uploaded resume there is nothing to index or rank.
    if not pdf:
        st.warning("Please upload at least one resume (PDF) before running the analysis.")
        return

    # The count arrives as free text; reject anything that is not a positive
    # integer instead of letting int() raise further down the pipeline.
    try:
        resume_count = int(document_count)
    except ValueError:
        resume_count = 0
    if resume_count < 1:
        st.warning("Please enter a whole number greater than zero for the number of resumes to return.")
        return

    with st.spinner('Wait for it...'):

        # Fresh identifier for this run; it is stored in every document's
        # metadata so one run's uploads can be told apart from another's.
        st.session_state['unique_id'] = uuid.uuid4().hex

        # Create a documents list out of all the user uploaded PDF files
        final_docs_list = create_docs(pdf, st.session_state['unique_id'])

        # Displaying the count of resumes that have been uploaded
        st.write("*Resumes uploaded* :"+str(len(final_docs_list)))

        # Create embeddings instance
        embeddings = create_embeddings_load_data()

        # The Pinecone path (push_to_pinecone / similar_docs) is currently
        # disabled; matching is done by close_matches instead.
        relevant_docs = close_matches(job_description, resume_count, final_docs_list, embeddings)

        # Introducing a line separator
        st.write(":heavy_minus_sign:" * 30)

        # For each match, display its rank, file name, score and summary
        for rank, (doc, score) in enumerate(relevant_docs, start=1):

            st.subheader("👉 "+str(rank))

            # Displaying the file name
            st.write("**File** : "+doc.metadata['name'])

            # Details are tucked into an expander to keep the list compact
            with st.expander('Show me 👀'):
                # NOTE(review): the score is shown as ``1 - score``; if the
                # vector store reports an unbounded distance this can be
                # negative - confirm the intended scale.
                st.info("**Match Score** : "+ str(1 - score))

                # Summary of the current resume, produced by get_summary
                summary = get_summary(doc)
                st.write("**Summary** : "+summary)

        st.success("Hope I was able to save your time❤️")


# Invoking main function
if __name__ == '__main__':
    main()
constants (1).py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
import os

# Secrets must never be committed to source control: the Pinecone API key is
# read from the PINECONE_API_KEY environment variable (empty string if unset).
# NOTE(review): the key that was previously hard-coded here has been published
# and should be treated as compromised - rotate it.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = "gcp-starter"
PINECONE_INDEX = "hresume"
env-example.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ OPENAI_API_KEY=""
2
+ HUGGINGFACEHUB_API_TOKEN=""
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ streamlit
3
+ openai
4
+ tiktoken
5
+ python-dotenv
6
+ unstructured
7
+ pinecone-client
8
+ pypdf
9
+ sentence_transformers
10
+ pdf2image
11
+ pdfminer.six
12
+ faiss-cpu
utils.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from langchain.vectorstores import Pinecone
4
+ from langchain.llms import OpenAI
5
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
+ from langchain.schema import Document
7
+ import pinecone
8
+ from langchain.vectorstores import FAISS
9
+ from pypdf import PdfReader
10
+ from langchain.llms.openai import OpenAI
11
+ from langchain.chains.summarize import load_summarize_chain
12
+ from langchain import HuggingFaceHub
13
+ from langchain.document_loaders import DirectoryLoader
14
+
15
+
16
+ #Extract Information from PDF file
17
def get_pdf_text(pdf_doc):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_doc: The PDF source handed straight to ``PdfReader``.

    Returns:
        A single string made of each page's extracted text, joined without
        any separator (an empty string when the PDF has no pages).
    """
    reader = PdfReader(pdf_doc)
    return "".join(page.extract_text() for page in reader.pages)
23
+
24
+
25
+
26
+ # Iterate over the PDF files
27
+ # that the user uploaded, one by one
28
def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF file.

    Args:
        user_pdf_list: Iterable of uploaded file objects exposing ``name``,
            ``type`` and ``size``. ``None`` is treated as "no files".
        unique_id: Identifier stored in each document's metadata under
            ``"unique_id"``.

    Returns:
        A list of ``Document`` objects in upload order, each holding the
        PDF's extracted text as ``page_content`` plus file metadata
        (``name``, ``id``, ``type``, ``size``, ``unique_id``).
    """
    docs = []
    for uploaded_file in user_pdf_list or []:
        # NOTE(review): recent Streamlit releases appear to expose the
        # identifier as ``file_id`` while older ones used ``id``; accept
        # either so an unpinned Streamlit upgrade does not raise here.
        file_id = getattr(uploaded_file, "file_id", None)
        if file_id is None:
            file_id = getattr(uploaded_file, "id", None)

        docs.append(Document(
            page_content=get_pdf_text(uploaded_file),
            metadata={
                "name": uploaded_file.name,
                "id": file_id,
                # Key was previously misspelled as "type=".
                "type": uploaded_file.type,
                "size": uploaded_file.size,
                "unique_id": unique_id,
            },
        ))
    return docs
44
+
45
+
46
+ #Create embeddings instance
47
def create_embeddings_load_data():
    """Return a new ``OpenAIEmbeddings`` instance built with default settings."""
    # Alternative kept for reference:
    # SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return OpenAIEmbeddings()
51
+
52
+
53
+ #Function to push data to Vector Store - Pinecone here
54
def push_to_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings,docs):
    """Initialise the Pinecone client and upload documents to an index.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the index the documents are written to.
        embeddings: Embeddings instance handed to ``Pinecone.from_documents``.
        docs: Documents to store.

    Returns:
        None.
    """
    pinecone.init(api_key=pinecone_apikey, environment=pinecone_environment)
    print("done......2")
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
62
+
63
+
64
+
65
+ #Function to pull information from Vector Store - Pinecone here
66
def pull_from_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings):
    """Initialise the Pinecone client and open an existing index.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the existing index to open.
        embeddings: Embeddings instance associated with the index.

    Returns:
        The object returned by ``Pinecone.from_existing_index``.
    """
    pinecone.init(api_key=pinecone_apikey, environment=pinecone_environment)
    return Pinecone.from_existing_index(pinecone_index_name, embeddings)
77
+
78
+
79
+
80
+ #Function to help us get relevant documents from vector store - based on user input
81
def similar_docs(query,k,pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings,unique_id):
    """Return the ``k`` documents in a Pinecone index most similar to ``query``.

    Args:
        query: Text to search for.
        k: Number of results to return; converted with ``int``.
        pinecone_apikey: API key for the Pinecone client.
        pinecone_environment: Environment name for the Pinecone client.
        pinecone_index_name: Name of the existing index to search.
        embeddings: Embeddings instance associated with the index.
        unique_id: Currently unused (see note below).

    Returns:
        The result of ``similarity_search_with_score`` on the index.

    Raises:
        ValueError: If ``k`` cannot be converted to an integer.
    """
    # pull_from_pinecone initialises the Pinecone client itself, so no
    # separate pinecone.init call is needed here.
    index = pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings)

    # NOTE(review): ``unique_id`` is accepted but not applied; a filtered
    # search ({"unique_id": unique_id}) was previously tried and disabled,
    # so results are not restricted to the current upload - confirm intent.
    return index.similarity_search_with_score(query, int(k))
95
+
96
def close_matches(query,k,docs,embeddings):
    """Rank ``docs`` against ``query`` using a FAISS store built on the fly.

    Args:
        query: Text to search for.
        k: Number of results to return; converted with ``int``.
        docs: Documents to index.
        embeddings: Embeddings instance used to build the store.

    Returns:
        The result of ``similarity_search_with_score`` on the FAISS store.
    """
    # API reference:
    # https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.faiss.FAISS.html#langchain.vectorstores.faiss.FAISS.similarity_search_with_score
    vector_store = FAISS.from_documents(docs, embeddings)
    return vector_store.similarity_search_with_score(query, int(k))
101
+
102
+
103
+ # Helps us get the summary of a document
104
def get_summary(current_doc):
    """Summarise a single document with a map-reduce summarisation chain.

    Args:
        current_doc: The document to summarise; it is passed to the chain
            as a one-element list.

    Returns:
        Whatever the chain's ``run`` call returns for that document.
    """
    # Alternative kept for reference:
    # HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature":1e-10})
    summarizer = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce")
    return summarizer.run([current_doc])