# ResuMate_NVIDIA / nlp.py
import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
# Langchain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using HuggingFace models
from langchain.vectorstores import FAISS  # Facebook AI Similarity Search vector store
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
from streamlit import cache_data

# Function to summarize resume text
@cache_data(show_spinner=False)
def summarize_text(text, max_length=100):
    if text != '':
        data = json.dumps(
            {
                "inputs": text,
                "parameters": {"max_length": max_length}
            }
        )
        response = requests.post(API_URL_summary, headers=HEADERS, data=data)
        if response.status_code != 200:
            return StreamlitException(f"**Error**: {response.status_code}")
        try:
            summary = response.json()[0]["generated_text"]
        except (KeyError, IndexError):
            return StreamlitException("**Error**: Invalid response from API.")
        return summary
    else:
        return 'nan'
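
# Hedged usage sketch (illustrative input, not from the app): assuming
# API_URL_summary points at a Hugging Face Inference API endpoint and HEADERS
# carries a valid token, a call looks like:
#
#     summary = summarize_text("Senior data engineer with 8 years of experience "
#                              "building ETL pipelines on AWS...", max_length=80)
#     if isinstance(summary, StreamlitException):
#         ...  # surface the error in the UI instead of rendering a summary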

# Function to extract candidate name(s) and email address(es) from resume text
@cache_data(show_spinner=False)
def extract_person_names_and_email(text):
    # Extract email addresses
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    # Extract person names via the hosted NER model
    data = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=data)
    output = json.loads(response.content.decode("utf-8"))
    person_names = set()
    for entity in output[0]:
        if entity["entity_group"] == "PER":
            person_names.add(entity["word"])
    return person_names, set(emails)
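
# Hedged sketch of the assumed API response shape: the loop above expects the
# NER endpoint to return aggregated entity groups, one list per input, e.g.
# [[{"entity_group": "PER", "word": "Jane Doe", "score": 0.99, ...}]], so a
# call like the following (illustrative input) would yield:
#
#     names, emails = extract_person_names_and_email("Jane Doe - jane@example.com")
#     # names -> {'Jane Doe'}, emails -> {'jane@example.com'}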

# Function to extract key technical skills from resume text
def extract_tech_skills(_doc):
    keywords = [token.text.upper() for token in _doc if token.text.lower() in TECH_SKILLS]
    return set(keywords)
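
# Hedged usage sketch: _doc is a spaCy Doc produced by lang_model (loaded at
# the bottom of this module), and matches depend on the TECH_SKILLS list in
# constants.py holding lowercase terms:
#
#     doc = lang_model("Built dashboards in Python and SQL on AWS.")
#     extract_tech_skills(doc)  # -> {'PYTHON', 'SQL', 'AWS'} if those terms are in TECH_SKILLS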

# Function to calculate overall percentage match between job description and resume
@cache_data(show_spinner=False)
def calculate_similarity(job_description, resume):
    if job_description != '':
        model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        job_description_embeddings = model.encode(job_description)
        resume_embeddings = model.encode(resume)
        similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
        return similarity_score[0][0] * 100
    else:
        return np.nan
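
# Hedged usage sketch (illustrative strings): both arguments are raw text and
# the result is a 0-100 score (a 0-d tensor, since util.cos_sim returns tensors):
#
#     calculate_similarity("Looking for a Python developer with NLP experience",
#                          "Python developer, 5 years, spaCy and transformers")
#     # e.g. tensor(62.31) -- the exact value depends on SENTENCE_TRANSFORMER_MODEL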

# Define a function to clean sentences
def clean_text(text):
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove more types of bullet points
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Remove extra new lines
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove any leading/trailing newlines and spaces
    text = text.strip('\n').strip()
    # Replace pipe symbol with a dot
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Add full stops to the end of each sentence
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Capitalize the first letter of each sentence
    text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda x: x.group().capitalize(), text)
    # Replace ' - ' with '. ' only if it's not part of a hyphenated word
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text

# Define a function to split cleaned text into sentences
def split_text(string):
    # Split the clean string into sentences using NLTK's punkt tokenizer
    sentences = sent_tokenize(string)
    return sentences
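
# Hedged sketch of the clean-then-split pipeline on a resume-like snippet
# (illustrative input; exact sentence splits depend on the punkt model):
#
#     raw = "• Led a team of 5 | Shipped an ML pipeline\n• Cut infra costs"
#     split_text(clean_text(raw))
#     # -> ['Led a team of 5.', 'Shipped an ML pipeline.', 'Cut infra costs']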

# Function to calculate average phrase-level match between job description and resumes
@cache_data(show_spinner=False)
def get_average_similarity_scores(job_description, resumes):
    # Calculate cosine similarity matrix between job description and resume phrases
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resumes)
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Calculate the average similarity score for each phrase in the job description
    # across all phrases in the resumes
    avg_similarity_scores = np.mean(similarity_matrix, axis=1)
    # Return the average similarity scores as a list
    return avg_similarity_scores.tolist()
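
# Hedged usage sketch: unlike calculate_similarity, this expects *lists* of
# phrases on both sides (so model.encode returns the 2-D arrays that
# cosine_similarity requires); resume_text below is an illustrative variable:
#
#     jd_phrases = ["Python experience", "Cloud deployment"]
#     resume_phrases = split_text(clean_text(resume_text))
#     get_average_similarity_scores(jd_phrases, resume_phrases)
#     # -> one averaged score per job-description phrase, e.g. [0.41, 0.28]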

# Function to respond to user Q&A over the uploaded document
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    # Load the document, chunk it, and index the chunks in a FAISS vector store
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(pages)
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    # Answer the query with an LLM over the most similar chunks
    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID, model_kwargs={
            "temperature": temperature, "max_length": max_length
        })
    chain = load_qa_chain(llm, chain_type="stuff")
    matched_docs = db.similarity_search(query)
    return chain.run(input_documents=matched_docs, question=query)
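
# Hedged usage sketch: `loader` is assumed to be any LangChain document loader
# that provides load_and_split(), e.g. PyPDFLoader over an uploaded resume
# ("resume.pdf" is a placeholder path):
#
#     from langchain.document_loaders import PyPDFLoader
#     answer = qna_query(PyPDFLoader("resume.pdf"), "What was the candidate's last job title?")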
# Load the English language model for spaCy
lang_model = spacy.load("en_core_web_sm")