import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
# LangChain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using Hugging Face models
from langchain.vectorstores import FAISS  # Facebook AI Similarity Search vector store
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
from streamlit import cache_data
# Function to summarize resume text
@cache_data(show_spinner=False)
def summarize_text(text, max_length=100):
    if text != '':
        data = json.dumps(
            {
                "inputs": text,
                "parameters": {"max_length": max_length}
            }
        )
        response = requests.post(API_URL_summary, headers=HEADERS, data=data)
        if response.status_code != 200:
            return StreamlitException(f"**Error**: {response.status_code}")
        try:
            summary = response.json()[0]["generated_text"]
        except (KeyError, IndexError):
            return StreamlitException("**Error**: Invalid response from API.")
        return summary
    else:
        return 'nan'
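# Illustrative usage (not executed here; a sketch, assuming `resume_text` holds the
# extracted resume contents): the function posts the text to the Hugging Face Inference
# API endpoint in API_URL_summary and returns either the summary string or a
# StreamlitException object for the caller to display.
#     summary = summarize_text(resume_text, max_length=120)
#     if isinstance(summary, StreamlitException):
#         ...  # surface the error in the Streamlit UI instead of showing a summary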
# Function to extract candidate name(s) and email address(es) from resume text
@cache_data(show_spinner=False)
def extract_person_names_and_email(text):
    # Extract email addresses with a simple regex
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    # Extract person names via the hosted NER model
    data = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=data)
    output = json.loads(response.content.decode("utf-8"))
    person_names = set()
    for entity in output[0]:
        if entity["entity_group"] == "PER":
            person_names.add(entity["word"])
    return person_names, set(emails)
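# Illustrative usage (a sketch, assuming `resume_text` is plain text extracted from a resume):
#     names, emails = extract_person_names_and_email(resume_text)
# `names` holds the words the NER endpoint tagged as PER entities; `emails` holds the
# regex matches found directly in the text.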
# Function to extract key technical skills from resume text
def extract_tech_skills(_doc):
    keywords = [token.text.upper() for token in _doc if token.text.lower() in TECH_SKILLS]
    return set(keywords)
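# Illustrative usage: the function expects a spaCy Doc, so run the text through the
# language model loaded at the bottom of this module first (names below are assumptions):
#     doc = lang_model(resume_text)
#     skills = extract_tech_skills(doc)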
# Function to calculate overall percentage match between job description and resume
@cache_data(show_spinner=False)
def calculate_similarity(job_description, resume):
    if job_description != '':
        model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        job_description_embeddings = model.encode(job_description)
        resume_embeddings = model.encode(resume)
        similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
        return similarity_score[0][0] * 100
    else:
        return np.NaN
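# Illustrative usage (a sketch, assuming both arguments are plain strings):
#     score = calculate_similarity(job_description_text, resume_text)
# The result is a cosine-similarity score scaled to 0-100, or np.NaN when the
# job description is empty.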
# Function to clean resume/job-description text before sentence splitting
def clean_text(text):
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove more types of bullet points at the start of lines
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Remove extra new lines
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove any leading/trailing newlines and spaces
    text = text.strip('\n').strip()
    # Replace pipe symbol with a dot
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Add full stops to the end of each sentence
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Capitalize the first letter of each sentence
    text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda x: x.group().capitalize(), text)
    # Replace ' - ' with '. ' only if it's not part of a hyphenated word
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text
# Function to split cleaned text into sentences
def split_text(string):
    # Split the cleaned string into sentences using NLTK's sentence tokenizer
    sentences = sent_tokenize(string)
    return sentences
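# Illustrative usage: clean_text and split_text are meant to be chained so the NLTK
# tokenizer sees normalized, punctuation-terminated sentences (variable name is an assumption):
#     sentences = split_text(clean_text(raw_resume_text))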
# Function to calculate overall percentage match
@cache_data(show_spinner=False)
def get_average_similarity_scores(job_description, resumes):
    # Calculate cosine similarity matrix between job description and resumes
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resumes)
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Average similarity of each job-description phrase across all phrases in the resumes
    avg_similarity_scores = np.mean(similarity_matrix, axis=1)
    # Return the average similarity scores as a list
    return avg_similarity_scores.tolist()
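# Illustrative usage (a sketch, assuming both arguments are lists of sentences/phrases,
# e.g. the output of split_text, so cosine_similarity receives 2-D embedding matrices):
#     jd_phrases = split_text(clean_text(job_description_text))
#     resume_phrases = split_text(clean_text(resume_text))
#     scores = get_average_similarity_scores(jd_phrases, resume_phrases)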
# Function to respond to user Q&A queries over the uploaded document
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    # Load the document and split it into chunks
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(pages)
    # Embed the chunks and index them in a FAISS vector store
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    # Build the LLM and QA chain, then answer from the most similar chunks
    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID, model_kwargs={
            "temperature": temperature, "max_length": max_length
        })
    chain = load_qa_chain(llm, chain_type="stuff")
    docs = db.similarity_search(query)
    return chain.run(input_documents=docs, question=query)
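# Illustrative usage (a sketch): any LangChain document loader exposing load_and_split()
# can be passed in; PyPDFLoader and the file name below are assumptions, not part of this app.
#     from langchain.document_loaders import PyPDFLoader
#     loader = PyPDFLoader("resume.pdf")
#     answer = qna_query(loader, "How many years of Python experience does the candidate have?")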
# Load the English language model for spaCy
lang_model = spacy.load("en_core_web_sm")