import functools
import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk

nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Langchain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using HuggingFace models
from langchain.vectorstores import FAISS  # facebook vectorization
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
from streamlit import cache_data

# Upper bound (seconds) on each HTTP call to the inference endpoints, so that a
# stalled connection cannot block the caller indefinitely.
_REQUEST_TIMEOUT_SECONDS = 120

# E-mail matcher. The top-level-domain class is letters only; the earlier
# `[A-Z|a-z]` form also accepted a literal '|' character.
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


# Function to summarize resume text
@cache_data(show_spinner=False)
def summarize_text(text, max_length=100):
    """Summarize resume text by posting it to ``API_URL_summary``.

    Args:
        text: Text to summarize.
        max_length: Value sent as the ``max_length`` request parameter.

    Returns:
        The ``"generated_text"`` field of the first item in the JSON reply.
        The string ``'nan'`` when ``text`` is the empty string.
        A ``StreamlitException`` instance (returned, not raised) when the
        status code is not 200 or the reply does not have the expected shape.

    Raises:
        requests.exceptions.RequestException: Transport failures, including a
            timeout after ``_REQUEST_TIMEOUT_SECONDS``.
    """
    if text == '':
        return 'nan'

    payload = json.dumps(
        {
            "inputs": text,
            "parameters": {"max_length": max_length},
        }
    )
    response = requests.post(
        API_URL_summary,
        headers=HEADERS,
        data=payload,
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    if response.status_code != 200:
        return StreamlitException(f"**Error**: {response.status_code}")

    try:
        return response.json()[0]["generated_text"]
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError: body is not JSON. TypeError: JSON of an unexpected shape
        # (e.g. a list whose first item is not a mapping).
        return StreamlitException("**Error**: Invalid response from API.")


# Function to extract candidate name(s) from resume text
@cache_data(show_spinner=False)
def extract_person_names_and_email(text):
    """Extract person names and e-mail addresses from resume text.

    E-mail addresses are found locally with a regular expression. Names come
    from posting the text to ``API_URL_name`` and keeping the ``"word"`` of
    every returned entity whose ``"entity_group"`` is ``"PER"``.

    The resume text and the raw service reply are intentionally not printed:
    both contain personal data.

    Args:
        text: Resume text to scan.

    Returns:
        A ``(person_names, emails)`` tuple of two sets of strings.

    Raises:
        requests.exceptions.RequestException: Transport failures, including a
            timeout after ``_REQUEST_TIMEOUT_SECONDS``.
        KeyError, IndexError, TypeError, ValueError: The reply is not the
            expected list of entity lists (for example an error payload);
            this function does not handle that case itself.
    """
    emails = set(_EMAIL_PATTERN.findall(text))

    payload = json.dumps({"inputs": [text]})
    response = requests.post(
        API_URL_name,
        headers=HEADERS,
        data=payload,
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    output = json.loads(response.content.decode("utf-8"))

    # One input text was sent, so only the first entity list is read.
    person_names = {
        entity["word"]
        for entity in output[0]
        if entity["entity_group"] == "PER"
    }
    return person_names, emails


# Function to extract key technical skills from resume text
def extract_tech_skills(_doc):
    """Return the upper-cased tokens of ``_doc`` that appear in ``TECH_SKILLS``.

    Args:
        _doc: Iterable of tokens exposing a ``.text`` string attribute
            (presumably a spaCy ``Doc`` - confirm against the caller).

    Returns:
        A set of upper-cased token texts whose lower-cased form is in
        ``TECH_SKILLS``.
    """
    return {
        token.text.upper()
        for token in _doc
        if token.text.lower() in TECH_SKILLS
    }


@functools.lru_cache(maxsize=1)
def _load_sentence_model():
    """Build the sentence-embedding model once per process and reuse it."""
    return SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)


# Function to calculate overall percentage match between job description and resume
@cache_data(show_spinner=False)
def calculate_similarity(job_description, resume):
    """Score how closely a resume matches a job description.

    Both texts are embedded with the ``SENTENCE_TRANSFORMER_MODEL`` model and
    compared with ``util.cos_sim``.

    Args:
        job_description: Job description text.
        resume: Resume text.

    Returns:
        The first element of the ``util.cos_sim`` result multiplied by 100,
        or ``np.nan`` when ``job_description`` is the empty string.
    """
    if job_description == '':
        # np.nan rather than np.NaN: the capitalised alias no longer exists in
        # NumPy 2.0.
        return np.nan

    model = _load_sentence_model()
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resume)
    similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
    return similarity_score[0][0] * 100


# NOTE(review): the line below is kept exactly as found. The definition of clean_text runs past the end of the reviewed text, so it was not reformatted.
# Define a function to clean sentences def clean_text(text): # Remove bullet points text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip() # Remove more types of bullet points text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE) # Remove extra new lines text = re.sub(r'\n+', '\n', text).strip() # Remove any leading/trailing newlines text = text.strip('\n') # Remove any leading/trailing spaces text = text.strip() # Replace pipe symbol with a dot text = re.sub(r'\s*\|\s*', '. ', text).strip() # Add full stops to the end of each sentence text = re.sub(r'([^.!?])\s*\n', r'\1. ', text) # Capitalize the first letter of each sentence text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda x: x.group().capitalize(), text) # Replace ' - ' with '. ' only if it's not part of a hyphenated word text = re.sub(r'(?