palitrajarshi committed · Commit e9f8bde · Parent(s): 319051d
Upload 5 files
Browse files:
- app.py (+72 -0)
- constants (1).py (+3 -0)
- env-example.txt (+2 -0)
- requirements.txt (+12 -0)
- utils.py (+110 -0)
app.py
ADDED
@@ -0,0 +1,72 @@
import streamlit as st
import constants
from utils import *
import uuid

# Creating session variables
if 'unique_id' not in st.session_state:
    st.session_state['unique_id'] = ''

def main():

    st.set_page_config(page_title="Resume Screening Assistance")
    st.title("HR - Resume Screening Assistance...💁 ")
    st.subheader("I can help you in the resume screening process")

    job_description = st.text_area("Please paste the 'JOB DESCRIPTION' here...", key="1")
    document_count = st.text_input("No. of 'RESUMES' to return", key="2")
    # Upload the resumes (PDF files only)
    pdf = st.file_uploader("Upload resumes here, only PDF files allowed", type=["pdf"], accept_multiple_files=True)

    submit = st.button("Help me with the analysis")

    if submit:
        with st.spinner('Wait for it...'):

            # Creating a unique ID, so that we can query and get only the
            # user-uploaded documents from the Pinecone vector store
            st.session_state['unique_id'] = uuid.uuid4().hex

            # Create a list of Documents out of all the user-uploaded PDF files
            final_docs_list = create_docs(pdf, st.session_state['unique_id'])
            #st.write(final_docs_list)

            # Displaying the count of resumes that have been uploaded
            st.write("*Resumes uploaded*: " + str(len(final_docs_list)))

            # Create an embeddings instance
            embeddings = create_embeddings_load_data()

            # Push data to Pinecone
            #push_to_pinecone(constants.PINECONE_API_KEY, constants.PINECONE_ENVIRONMENT, constants.PINECONE_INDEX, embeddings, final_docs_list)

            # Fetch relevant documents from Pinecone
            #relevant_docs = similar_docs(job_description, document_count, constants.PINECONE_API_KEY, constants.PINECONE_ENVIRONMENT, constants.PINECONE_INDEX, embeddings, st.session_state['unique_id'])
            relevant_docs = close_matches(job_description, document_count, final_docs_list, embeddings)
            #st.write(relevant_docs)

            # Introducing a line separator
            st.write(":heavy_minus_sign:" * 30)

            # For each item in relevant_docs, display some info about it on the UI
            for item in range(len(relevant_docs)):

                st.subheader("👉 " + str(item + 1))

                # Displaying the file name
                st.write("**File** : " + relevant_docs[item][0].metadata['name'])

                # Introducing the expander feature
                with st.expander('Show me 👀'):
                    # close_matches returns (Document, distance) pairs; lower distance = closer match
                    st.info("**Match Score** : " + str(1 - relevant_docs[item][1]))
                    #st.write("***" + relevant_docs[item][0].page_content)

                    # Get the summary of the current item using the 'get_summary'
                    # function we created, which uses an LLM and a LangChain chain
                    summary = get_summary(relevant_docs[item][0])
                    st.write("**Summary** : " + summary)

        st.success("Hope I was able to save your time ❤️")


# Invoking the main function
if __name__ == '__main__':
    main()
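A note on the Match Score line above: FAISS's similarity_search_with_score returns a raw L2 distance (lower means closer), not a similarity bounded to [0, 1], so "1 - score" can turn negative for poor matches. A minimal normalization sketch, assuming the default L2 metric; the match_score helper below is hypothetical, not part of this commit:

# Hypothetical helper: map an unbounded L2 distance to a (0, 1] similarity.
# Assumes the FAISS index uses its default L2 metric.
def match_score(distance: float) -> float:
    return 1.0 / (1.0 + distance)

# e.g. st.info("**Match Score** : " + str(match_score(relevant_docs[item][1])))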
constants (1).py
ADDED
@@ -0,0 +1,3 @@
PINECONE_API_KEY="a4405723-2309-4c5c-87d0-760f461fdef0"
PINECONE_ENVIRONMENT="gcp-starter"
PINECONE_INDEX="hresume"
env-example.txt
ADDED
@@ -0,0 +1,2 @@
OPENAI_API_KEY=""
HUGGINGFACEHUB_API_TOKEN=""
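For reference, a minimal sketch of how these variables could be loaded at startup with python-dotenv (already in requirements.txt); the committed files don't show this wiring explicitly, so treat it as an assumption:

# Hedged sketch: load a local .env (copied from env-example.txt) into the
# process environment so OpenAIEmbeddings/OpenAI can pick up the keys.
import os
from dotenv import load_dotenv

load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")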
requirements.txt
ADDED
@@ -0,0 +1,12 @@
langchain
streamlit
openai
tiktoken
python-dotenv
unstructured
pinecone-client
pypdf
sentence_transformers
pdf2image
pdfminer.six
faiss-cpu
utils.py
ADDED
@@ -0,0 +1,110 @@
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Pinecone, FAISS
from langchain.llms import OpenAI
from langchain.schema import Document
from langchain.chains.summarize import load_summarize_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import DirectoryLoader
from pypdf import PdfReader
import pinecone


# Extract the text from a PDF file
def get_pdf_text(pdf_doc):
    text = ""
    pdf_reader = PdfReader(pdf_doc)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


# Iterate over the user-uploaded PDF files, one by one,
# and wrap each one in a LangChain Document
def create_docs(user_pdf_list, unique_id):
    docs = []
    for filename in user_pdf_list:

        chunks = get_pdf_text(filename)

        # Adding items to our list - adding the data & its metadata
        docs.append(Document(
            page_content=chunks,
            metadata={"name": filename.name, "id": filename.id, "type": filename.type, "size": filename.size, "unique_id": unique_id},
        ))
    # Load files from a directory (local version)
    #loader = DirectoryLoader('./Repository', glob='**/*')
    #docs1 = loader.load()
    #final_docs = docs + docs1
    return docs


# Create an embeddings instance
def create_embeddings_load_data():
    embeddings = OpenAIEmbeddings()
    #embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return embeddings


# Function to push data to the vector store - Pinecone here
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):

    pinecone.init(
        api_key=pinecone_apikey,
        environment=pinecone_environment
    )
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)


# Function to pull information from the vector store - Pinecone here
def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):

    pinecone.init(
        api_key=pinecone_apikey,
        environment=pinecone_environment
    )

    index = Pinecone.from_existing_index(pinecone_index_name, embeddings)
    return index


# Function to help us get relevant documents from the vector store, based on user input
def similar_docs(query, k, pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, unique_id):

    # pull_from_pinecone already calls pinecone.init, so there is no need to init again here
    index = pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings)
    #similar_docs = index.similarity_search_with_score(query, int(k), {"unique_id": unique_id})
    similar_docs = index.similarity_search_with_score(query, int(k))
    #print(similar_docs)
    return similar_docs


# In-memory alternative to Pinecone: build a FAISS index over the uploaded
# documents and return the k closest matches with their distance scores
def close_matches(query, k, docs, embeddings):
    # https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.faiss.FAISS.html#langchain.vectorstores.faiss.FAISS.similarity_search_with_score
    db = FAISS.from_documents(docs, embeddings)
    similar_docs = db.similarity_search_with_score(query, int(k))
    return similar_docs


# Helps us get the summary of a document
def get_summary(current_doc):
    llm = OpenAI(temperature=0)
    #llm = HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature": 1e-10})
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run([current_doc])

    return summary
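A standalone usage sketch of the FAISS path in utils.py (no Streamlit, no Pinecone); the sample resumes and query below are hypothetical, and OPENAI_API_KEY must be set for the embeddings and summary calls to work:

from langchain.schema import Document

# Hypothetical sample data; in the app, real Documents come from create_docs()
resumes = [
    Document(page_content="Python developer, 5 years, ML pipelines", metadata={"name": "a.pdf"}),
    Document(page_content="Accountant, 10 years, audit and tax", metadata={"name": "b.pdf"}),
]

embeddings = create_embeddings_load_data()    # OpenAIEmbeddings under the hood
matches = close_matches("machine learning engineer", 1, resumes, embeddings)
for doc, score in matches:                    # score is an L2 distance (lower = closer)
    print(doc.metadata["name"], score, get_summary(doc))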